milupHPC documentation
  • src
  • subdomain_key_tree
subdomain.cu
Go to the documentation of this file.
1#include "../../include/subdomain_key_tree/subdomain.cuh"
2#include "../../include/cuda_utils/cuda_launcher.cuh"
3#include <cub/cub.cuh>
4
5CUDA_CALLABLE_MEMBER void KeyNS::key2Char(keyType key, integer maxLevel, char *keyAsChar) {
6 int level[21];
7 for (int i=0; i<maxLevel; i++) {
8 level[i] = (int)(key >> (maxLevel*DIM - DIM*(i+1)) & (int)(POW_DIM - 1));
9 }
10 for (int i=0; i<=maxLevel; i++) {
11 keyAsChar[2*i] = level[i] + '0';
12 keyAsChar[2*i+1] = '|';
13 }
14 keyAsChar[2*maxLevel+3] = '\0';
15}
16
17CUDA_CALLABLE_MEMBER integer KeyNS::key2proc(keyType key, SubDomainKeyTree *subDomainKeyTree/*, Curve::Type curveType*/) {
18 return subDomainKeyTree->key2proc(key); //, curveType);
19}
20
21CUDA_CALLABLE_MEMBER SubDomainKeyTree::SubDomainKeyTree() {
22
23}
24
25CUDA_CALLABLE_MEMBER SubDomainKeyTree::SubDomainKeyTree(integer rank, integer numProcesses, keyType *range,
26 integer *procParticleCounter) : rank(rank),
27 numProcesses(numProcesses), range(range),
28 procParticleCounter(procParticleCounter) {
29
30}
31
32CUDA_CALLABLE_MEMBER SubDomainKeyTree::~SubDomainKeyTree() {
33
34}
35
36CUDA_CALLABLE_MEMBER void SubDomainKeyTree::set(integer rank, integer numProcesses, keyType *range,
37 integer *procParticleCounter) {
38 this->rank = rank;
39 this->numProcesses = numProcesses;
40 this->range = range;
41 this->procParticleCounter = procParticleCounter;
42}
43
44//switch (curveType) {
45// case Curve::lebesgue: {
46// for (integer proc = 0; proc < numProcesses; proc++) {
47// if (key >= range[proc] && key < range[proc + 1]) {
48// return proc;
49// }
50// }
51// }
52// case Curve::hilbert: {
53//
54// keyType hilbert = Lebesgue2Hilbert(key, 21);
55// for (int proc = 0; proc < s->numProcesses; proc++) {
56// if (hilbert >= s->range[proc] && hilbert < s->range[proc + 1]) {
57// return proc;
58// }
59// }
60//
61// }
62// default: {
63// printf("Curve type not available!\n");
64// }
65//}
66
67// ATTENTION: independent of lebesgue/hilbert, thus provide appropriate key!
68CUDA_CALLABLE_MEMBER integer SubDomainKeyTree::key2proc(keyType key/*, Curve::Type curveType*/) {
69
70 for (integer proc = 0; proc < numProcesses; proc++) {
71 if (key >= range[proc] && key < range[proc + 1]) {
72 return proc;
73 }
74 }
75 printf("ERROR: key2proc(k=%lu): -1!\n", key);
76 return -1;
77}
78
79CUDA_CALLABLE_MEMBER bool SubDomainKeyTree::isDomainListNode(keyType key, integer maxLevel, integer level,
80 Curve::Type curveType) {
81 integer p1, p2;
82 //p1 = key2proc(key);
83 //p2 = key2proc(key | ~(~0UL << DIM * (maxLevel - level)));
84
85 switch (curveType) {
86 case Curve::lebesgue: {
87 p1 = key2proc(key);
88 p2 = key2proc(key | ~(~0UL << DIM * (maxLevel - level)));
89 break;
90 }
91 case Curve::hilbert: {
92 //p1 = key2proc(KeyNS::lebesgue2hilbert(key, maxLevel));
93 //p2 = key2proc(KeyNS::lebesgue2hilbert(key | ~(~0UL << DIM * (maxLevel - level)), maxLevel));
94
95 p1 = key2proc(KeyNS::lebesgue2hilbert(key, maxLevel, level));
96 //p2 = key2proc(KeyNS::lebesgue2hilbert(key | ~(~0UL << DIM * (maxLevel - level)), maxLevel, maxLevel));
97 keyType hilbert = KeyNS::lebesgue2hilbert(key, maxLevel, level);
98#if DIM == 1
99 keyType shiftValue = 1;
100 keyType toShift = 21;
101 keyType keyMax = (shiftValue << toShift);
102#elif DIM == 2
103 keyType shiftValue = 1;
104 keyType toShift = 42;
105 keyType keyMax = (shiftValue << toShift);
106#else
107 //keyType shiftValue = 1;
108 //keyType toShift = 63;
109 //keyType keyMax = (shiftValue << toShift);
110 keyType keyMax = ~0UL; //KEY_MAX;
111#endif
112 //p2 = key2proc(hilbert | (KEY_MAX >> (DIM*level+1)));
113 p2 = key2proc(hilbert | (keyMax >> (DIM*level+1)));
114
115 //printf("lebesgue: %lu vs %lu < ? : %i\n", key, key | ~(~0UL << DIM * (maxLevel - level)),
116 // key < (key | ~(~0UL << DIM * (maxLevel - level))));
117 //printf("hilbert: %lu vs %lu < ? : %i\n", KeyNS::lebesgue2hilbert(key, maxLevel, maxLevel),
118 // hilbert | (KEY_MAX >> (DIM*level+1)),
119 //KeyNS::lebesgue2hilbert(key, maxLevel, maxLevel) < (hilbert | (KEY_MAX >> (DIM*level+1))));
120
121 break;
122 }
123 default: {
124 printf("Curve type not available!\n");
125 }
126 }
127 if (p1 != p2) {
128 return true;
129 }
130 return false;
131}
132
133namespace SubDomainKeyTreeNS {
134
135 namespace Kernel {
136
137 __global__ void set(SubDomainKeyTree *subDomainKeyTree, integer rank, integer numProcesses, keyType *range,
138 integer *procParticleCounter) {
139 subDomainKeyTree->set(rank, numProcesses, range, procParticleCounter);
140 }
141
142 __global__ void test(SubDomainKeyTree *subDomainKeyTree) {
143 printf("device: subDomainKeyTree->rank = %i\n", subDomainKeyTree->rank);
144 printf("device: subDomainKeyTree->numProcesses = %i\n", subDomainKeyTree->numProcesses);
145 }
146
147 // "serial" version
148 __global__ void buildDomainTree(Tree *tree, Particles *particles, DomainList *domainList, integer n, integer m) {
149
150 integer domainListCounter = 0;
151
152 integer path[MAX_LEVEL];
153
154 real min_x, max_x;
155#if DIM > 1
156 real min_y, max_y;
157#if DIM == 3
158 real min_z, max_z;
159#endif
160#endif
161 integer currentChild;
162 integer childPath;
163 bool insert = true;
164
165 integer childIndex;
166 integer temp;
167
168 // loop over domain list indices (over the keys found/generated by createDomainListKernel)
169 for (int i = 0; i < *domainList->domainListIndex; i++) {
170 childIndex = 0;
171 // iterate through levels (of corresponding domainListIndex)
172 for (int j = 0; j < domainList->domainListLevels[i]; j++) {
173 path[j] = (integer) (domainList->domainListKeys[i] >> (MAX_LEVEL * DIM - DIM * (j + 1)) &
174 (integer)(POW_DIM - 1));
175 temp = childIndex;
176 childIndex = tree->child[POW_DIM*childIndex+path[j]];//tree->child[POW_DIM*childIndex + path[j]];
177 if (childIndex < n) {
178 if (childIndex == -1) {
179 // no child at all here, thus add node
180 integer cell = atomicAdd(tree->index, 1);
181 tree->child[POW_DIM * temp + path[j]] = cell;
182 particles->level[cell] = j + 1; //particles->level[temp] + 1;
183 childIndex = cell;
184 domainList->domainListIndices[domainListCounter] = childIndex; //cell;
185 particles->nodeType[childIndex] = 1;
186#if DEBUGGING
187#if DIM == 3
188 printf("adding node index %i cell = %i (childPath = %i, j = %i)! x = (%e, %e, %e) level = %i\n",
189 temp, cell, path[j], j, particles->x[childIndex], particles->y[childIndex], particles->z[childIndex],
190 particles->level[cell]);
191#endif
192#endif
193 domainListCounter++;
194 } else {
195 // child is a leaf, thus add node in between
196 integer cell = atomicAdd(tree->index, 1);
197 tree->child[POW_DIM * temp + path[j]] = cell;
198
199 min_x = *tree->minX;
200 max_x = *tree->maxX;
201#if DIM > 1
202 min_y = *tree->minY;
203 max_y = *tree->maxY;
204#if DIM == 3
205 min_z = *tree->minZ;
206 max_z = *tree->maxZ;
207#endif
208#endif
209
210 for (int k=0; k<=j; k++) {
211
212 currentChild = path[k];
213
214 if (currentChild % 2 != 0) {
215 max_x = 0.5 * (min_x + max_x);
216 currentChild -= 1;
217 }
218 else {
219 min_x = 0.5 * (min_x + max_x);
220 }
221#if DIM > 1
222 if (currentChild % 2 == 0 && currentChild % 4 != 0) {
223 max_y = 0.5 * (min_y + max_y);
224 currentChild -= 2;
225 }
226 else {
227 min_y = 0.5 * (min_y + max_y);
228 }
229#if DIM == 3
230 if (currentChild == 4) {
231 max_z = 0.5 * (min_z + max_z);
232 currentChild -= 4;
233 }
234 else {
235 min_z = 0.5 * (min_z + max_z);
236 }
237#endif
238#endif
239 }
240 // insert old/original particle
241 childPath = 0; //(int) (domainListKeys[i] >> (21 * 3 - 3 * ((j+1) + 1)) & (int)7); //0; //currentChild; //0;
242 if (particles->x[childIndex] < 0.5 * (min_x + max_x)) {
243 childPath += 1;
244 //max_x = 0.5 * (min_x + max_x);
245 }
246 //else { min_x = 0.5 * (min_x + max_x); }
247#if DIM > 1
248 if (particles->y[childIndex] < 0.5 * (min_y + max_y)) {
249 childPath += 2;
250 //max_y = 0.5 * (min_y + max_y);
251 }
252 //else { min_y = 0.5 * (min_y + max_y); }
253#if DIM == 3
254 if (particles->z[childIndex] < 0.5 * (min_z + max_z)) {
255 childPath += 4;
256 //max_z = 0.5 * (min_z + max_z);
257 }
258 //else { min_z = 0.5 * (min_z + max_z); }
259#endif
260#endif
261
262 //particles->x[cell] += particles->weightedEntry(childIndex, Entry::x);
263 particles->x[cell] = particles->x[childIndex];
264#if DIM > 1
265 //particles->y[cell] += particles->weightedEntry(childIndex, Entry::y);
266 particles->y[cell] = particles->y[childIndex];
267#if DIM == 3
268 //particles->z[cell] += particles->weightedEntry(childIndex, Entry::z);
269 particles->z[cell] = particles->z[childIndex];
270#endif
271#endif
272 //particles->mass[cell] += particles->mass[childIndex];
273 particles->mass[cell] = particles->mass[childIndex];
274
275 particles->level[cell] = particles->level[childIndex];
276 particles->level[childIndex] += 1;
277#if DEBUGGING
278#if DIM == 3
279 printf("adding node in between for index %i, temp = %i cell = %i (childPath = %i, j = %i)! x = (%e, %e, %e) level = %i vs %i\n",
280 childIndex, temp, cell, childPath, j, particles->x[childIndex], particles->y[childIndex], particles->z[childIndex],
281 particles->level[cell], particles->level[childIndex]);
282#endif
283#endif
284
285 tree->child[POW_DIM * cell + childPath] = childIndex;
286
287 childIndex = cell;
288 domainList->domainListIndices[domainListCounter] = childIndex; //temp;
289 particles->nodeType[childIndex] = 1;
290 domainListCounter++;
291 }
292 }
293 else {
294 insert = true;
295 // check whether node already marked as domain list node
296 if (particles->nodeType[childIndex] >= 1) {
297 insert = false;
298 }
299 //for (int k=0; k<domainListCounter; k++) {
300 // if (childIndex == domainList->domainListIndices[k]) {
301 // insert = false;
302 // break;
303 // }
304 //}
305 if (insert) {
306 // mark/save node as domain list node
307 domainList->domainListIndices[domainListCounter] = childIndex; //temp;
308 particles->nodeType[childIndex] = 1;
309 domainListCounter++;
310 }
311 }
312 }
313 }
314 }
315
316 /*
317 // parallel version
318 __global__ void buildDomainTree(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles,
319 DomainList *domainList, integer n, integer m, integer level) {
320
321 integer index = threadIdx.x + blockIdx.x * blockDim.x;
322 integer stride = blockDim.x * gridDim.x;
323 integer offset = 0;
324
325 integer domainListCounter;
326
327 integer path[MAX_LEVEL];
328
329 real min_x, max_x;
330#if DIM > 1
331 real min_y, max_y;
332#if DIM == 3
333 real min_z, max_z;
334#endif
335#endif
336 integer currentChild;
337 integer childPath;
338 bool insert = true;
339
340 integer childIndex;
341 integer temp;
342 int j;
343
344 // loop over domain list indices (over the keys found/generated by createDomainListKernel)
345 while ((index + offset) < *domainList->domainListIndex) {
346
347 for (j = 0; j < MAX_LEVEL; j++) {
348 path[j] = 0;
349 }
350 if (domainList->domainListLevels[index + offset] == level) {
351 //printf("domainListKey[%i] = %lu\n", i, domainList->domainListKeys[i]);
352 childIndex = 0;
353 temp = 0;
354 // iterate through levels (of corresponding domainListIndex)
355 for (j = 0; j < domainList->domainListLevels[index + offset]; j++) {
356 path[j] = (integer) (
357 domainList->domainListKeys[index + offset] >> (MAX_LEVEL * DIM - DIM * (j + 1)) &
358 (integer) (POW_DIM - 1));
359 temp = childIndex;
360 childIndex = tree->child[POW_DIM * childIndex + path[j]];
361 }
362 if (childIndex < n) {
363 if (childIndex == -1) {
364 // no child at all here, thus add node
365 printf("build domain tree, adding node ...\n");
366 integer cell = atomicAdd(tree->index, 1);
367 tree->child[POW_DIM * temp + path[j-1]] = cell;
368 particles->level[cell] = j;
369 childIndex = cell;
370 //domainListCounter = atomicAdd(domainList->domainListCounter, 1);
371 domainList->domainListIndices[index + offset] = childIndex; //cell;
372 particles->nodeType[childIndex] = 1;
373
374 min_x = *tree->minX;
375 max_x = *tree->maxX;
376#if DIM > 1
377 min_y = *tree->minY;
378 max_y = *tree->maxY;
379#if DIM == 3
380 min_z = *tree->minZ;
381 max_z = *tree->maxZ;
382#endif
383#endif
384
385 // new
386 int k;
387 for (k = 0; k < j; k++) {
388
389 currentChild = path[k];
390 //printf("[rank %i] currentChild: %i\n", subDomainKeyTree->rank, currentChild);
391 if (currentChild & 1) {
392 max_x = 0.5 * (min_x + max_x);
393 //path -= 1;
394 } else {
395 min_x = 0.5 * (min_x + max_x);
396 }
397#if DIM > 1
398 if ((currentChild >> 1) & 1) {
399 //if (path % 2 == 0 && path % 4 != 0) {
400 max_y = 0.5 * (min_y + max_y);
401 //path -= 2;
402 } else {
403 min_y = 0.5 * (min_y + max_y);
404 }
405#if DIM == 3
406 if ((currentChild >> 2) & 1) {
407 //if (path == 4) {
408 max_z = 0.5 * (min_z + max_z);
409 //path -= 4;
410 } else {
411 min_z = 0.5 * (min_z + max_z);
412 }
413#endif
414#endif
415
416 //if (currentChild % 2 != 0) {
417 // max_x = 0.5 * (min_x + max_x);
418 // currentChild -= 1;
419 //} else {
420 // min_x = 0.5 * (min_x + max_x);
421 //}
422 //#if DIM > 1
423 //if (currentChild % 2 == 0 && currentChild % 4 != 0) {
424 // max_y = 0.5 * (min_y + max_y);
425 // currentChild -= 2;
426 //} else {
427 // min_y = 0.5 * (min_y + max_y);
428 //}
429 //#if DIM == 3
430 //if (currentChild == 4) {
431 // max_z = 0.5 * (min_z + max_z);
432 // currentChild -= 4;
433 //} else {
434 // min_z = 0.5 * (min_z + max_z);
435 //}
436 //#endif
437 //#endif
438 //}
439
440 //particles->x[cell] = particles->x[childIndex];
441 particles->x[cell] = 0.5 * (min_x + max_x);
442#if DIM > 1
443 //particles->y[cell] = particles->y[childIndex];
444 particles->y[cell] = 0.5 * (min_y + max_y);
445#if DIM == 3
446 //particles->z[cell] = particles->z[childIndex];
447 particles->z[cell] = 0.5 * (min_z + max_z);
448#endif
449#endif
450 // end: new
451#if DEBUGGING
452#if DIM == 3
453 printf("adding node index %i cell = %i (childPath = %i, j = %i)! x = (%e, %e, %e) level = %i\n",
454 temp, cell, path[j], j, particles->x[childIndex], particles->y[childIndex], particles->z[childIndex],
455 particles->level[cell]);
456#endif
457#endif
458#if DIM == 3
459#if DEBUGGING
460 printf("[rank %i] adding domainListIndices[%i] = %i, childIndex = %i, path = %i\n", subDomainKeyTree->rank,
461 index + offset, domainList->domainListIndices[index + offset], childIndex, path[j-1]);
462 //printf("adding node index %i cell = %i (childPath = %i, j = %i)! x = (%f, %f, %f)\n",
463 // childIndex, cell, path[j], j, particles->x[childIndex], particles->y[childIndex],
464 // particles->z[childIndex]);
465#endif
466#endif
467 } else {
468 printf("build domain tree, adding node in between...\n");
469 // child is a leaf, thus add node in between
470 integer cell = atomicAdd(tree->index, 1);
471 tree->child[POW_DIM * temp + path[j - 1]] = cell;
472
473 min_x = *tree->minX;
474 max_x = *tree->maxX;
475#if DIM > 1
476 min_y = *tree->minY;
477 max_y = *tree->maxY;
478#if DIM == 3
479 min_z = *tree->minZ;
480 max_z = *tree->maxZ;
481#endif
482#endif
483 int k;
484 for (k = 0; k < j; k++) {
485
486 currentChild = path[k];
487
488 //if (currentChild % 2 != 0) {
489 if (currentChild & 1) {
490 max_x = 0.5 * (min_x + max_x);
491 //currentChild -= 1;
492 } else {
493 min_x = 0.5 * (min_x + max_x);
494 }
495#if DIM > 1
496 //if (currentChild % 2 == 0 && currentChild % 4 != 0) {
497 if ((currentChild >> 1) & 1) {
498 max_y = 0.5 * (min_y + max_y);
499 //currentChild -= 2;
500 } else {
501 min_y = 0.5 * (min_y + max_y);
502 }
503#if DIM == 3
504 //if (currentChild == 4) {
505 if ((currentChild >> 2) & 1) {
506 max_z = 0.5 * (min_z + max_z);
507 //currentChild -= 4;
508 } else {
509 min_z = 0.5 * (min_z + max_z);
510 }
511#endif
512#endif
513 }
514
515 //TODO: was particles->x[cell] = 0.5 * (min_x + max_x);
516 particles->x[cell] = particles->x[childIndex];
517 //particles->x[cell] = 0.5 * (min_x + max_x);
518#if DIM > 1
519 particles->y[cell] = particles->y[childIndex];
520 //particles->y[cell] = 0.5 * (min_y + max_y);
521#if DIM == 3
522 particles->z[cell] = particles->z[childIndex];
523 //particles->z[cell] = 0.5 * (min_z + max_z);
524#endif
525#endif
526 //particles->mass[cell] = particles->mass[childIndex];
527
528
529 // //particles->x[cell] += particles->weightedEntry(childIndex, Entry::x);
530 //particles->x[cell] = particles->x[childIndex];
531 //#if DIM > 1
532 // //particles->y[cell] += particles->weightedEntry(childIndex, Entry::y);
533 //particles->y[cell] = particles->y[childIndex];
534 //#if DIM == 3
535 // //particles->z[cell] += particles->weightedEntry(childIndex, Entry::z);
536 //particles->z[cell] = particles->z[childIndex];
537 //#endif
538 //#endif
539 // //particles->mass[cell] += particles->mass[childIndex];
540 //particles->mass[cell] = particles->mass[childIndex];
541
542 particles->level[cell] = particles->level[childIndex];
543 //particles->level[childIndex] += 1;
544 atomicAdd(&particles->level[childIndex], 1);
545
546 // insert old/original particle
547 childPath = 0; //(int) (domainListKeys[i] >> (21 * 3 - 3 * ((j+1) + 1)) & (int)7); //0; //currentChild; //0;
548 if (particles->x[childIndex] < 0.5 * (min_x + max_x)) {
549 childPath += 1;
550 //max_x = 0.5 * (min_x + max_x);
551 }
552 //else { min_x = 0.5 * (min_x + max_x); }
553#if DIM > 1
554 if (particles->y[childIndex] < 0.5 * (min_y + max_y)) {
555 childPath += 2;
556 //max_y = 0.5 * (min_y + max_y);
557 }
558 //else { min_y = 0.5 * (min_y + max_y); }
559#if DIM == 3
560 if (particles->z[childIndex] < 0.5 * (min_z + max_z)) {
561 childPath += 4;
562 //max_z = 0.5 * (min_z + max_z);
563 }
564 //else { min_z = 0.5 * (min_z + max_z); }
565#endif
566#endif
567
568
569
570#if DEBUGGING
571#if DIM == 3
572 printf("adding node in between for index %i, temp = %i cell = %i (childPath = %i, j = %i)! x = (%e, %e, %e) level = %i vs %i\n",
573 childIndex, temp, cell, childPath, j, particles->x[childIndex], particles->y[childIndex], particles->z[childIndex],
574 particles->level[cell], particles->level[childIndex]);
575#endif
576#endif
577
578
579 tree->child[POW_DIM * cell + childPath] = childIndex;
580 printf("child[8 * %i + %i] = %i\n", cell, childPath, childIndex);
581
582 //particles->nodeType[childIndex] = -10; // just some testing
583
584 childIndex = cell;
585 //domainListCounter = atomicAdd(domainList->domainListCounter, 1);
586 domainList->domainListIndices[index + offset] = childIndex; //temp;
587 particles->nodeType[childIndex] = 1;
588 }
589 } else {
590 // mark/save node as domain list node
591 //domainListCounter = atomicAdd(domainList->domainListCounter, 1);
592#if DEBUGGING
593 printf("[rank %i] Mark already available node %i: %i, path = %i (level = %i)\n",
594 subDomainKeyTree->rank, index + offset, childIndex, path[j-1], level);
595#endif
596 domainList->domainListIndices[index + offset] = childIndex; //temp;
597 particles->nodeType[childIndex] = 1;
598
599 }
600 __threadfence();
601 }
602 __syncthreads();
603
604 offset += stride;
605 }
606 }*/
607
608 // parallel version
609 __global__ void buildDomainTree(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles,
610 DomainList *domainList, integer n, integer m, integer level) {
611
612 integer index = threadIdx.x + blockIdx.x * blockDim.x;
613 integer stride = blockDim.x * gridDim.x;
614 integer offset = 0;
615
616 integer domainListCounter;
617
618 integer path[MAX_LEVEL];
619
620 real min_x, max_x;
621#if DIM > 1
622 real min_y, max_y;
623#if DIM == 3
624 real min_z, max_z;
625#endif
626#endif
627 integer currentChild;
628 integer childPath;
629 bool insert = true;
630
631 integer childIndex;
632 integer temp;
633 int j;
634
635 // loop over domain list indices (over the keys found/generated by createDomainListKernel)
636 while ((index + offset) < *domainList->domainListIndex) {
637
638 #pragma unroll
639 for (j = 0; j < MAX_LEVEL; j++) {
640 path[j] = 0;
641 }
642 if (domainList->domainListLevels[index + offset] == level) {
643 //printf("domainListKey[%i] = %lu\n", i, domainList->domainListKeys[i]);
644 childIndex = 0;
645 temp = 0;
646
647 min_x = *tree->minX;
648 max_x = *tree->maxX;
649#if DIM > 1
650 min_y = *tree->minY;
651 max_y = *tree->maxY;
652#if DIM == 3
653 min_z = *tree->minZ;
654 max_z = *tree->maxZ;
655#endif
656#endif
657
658 // iterate through levels (of corresponding domainListIndex)
659 for (j = 0; j < domainList->domainListLevels[index + offset]; j++) {
660 path[j] = (integer) (
661 domainList->domainListKeys[index + offset] >> (MAX_LEVEL * DIM - DIM * (j + 1)) &
662 (integer) (POW_DIM - 1));
663 temp = childIndex;
664 childIndex = tree->child[POW_DIM * childIndex + path[j]];
665 }
666
667 int k;
668 for (k = 0; k < j; k++) {
669
670 currentChild = path[k];
671 if (currentChild & 1) {
672 max_x = 0.5 * (min_x + max_x);
673 } else {
674 min_x = 0.5 * (min_x + max_x);
675 }
676#if DIM > 1
677 if ((currentChild >> 1) & 1) {
678 max_y = 0.5 * (min_y + max_y);
679 //path -= 2;
680 } else {
681 min_y = 0.5 * (min_y + max_y);
682 }
683#if DIM == 3
684 if ((currentChild >> 2) & 1) {
685 max_z = 0.5 * (min_z + max_z);
686 } else {
687 min_z = 0.5 * (min_z + max_z);
688 }
689#endif
690#endif
691 }
692
693 if (childIndex < n) {
694 if (childIndex == -1) {
695 // no child at all here, thus add node
696 //printf("build domain tree, adding node ...\n");
697 integer cell = atomicAdd(tree->index, 1);
698 tree->child[POW_DIM * temp + path[j-1]] = cell;
699 particles->level[cell] = j;
700 childIndex = cell;
701
702 domainList->domainListIndices[index + offset] = childIndex;
703 particles->nodeType[childIndex] = 1;
704
705 domainList->borders[(index + offset) * 2 * DIM] = min_x;
706 domainList->borders[(index + offset) * 2 * DIM + 1] = max_x;
707#if DIM > 1
708 domainList->borders[(index + offset) * 2 * DIM + 2] = min_y;
709 domainList->borders[(index + offset) * 2 * DIM + 3] = max_y;
710#if DIM == 3
711 domainList->borders[(index + offset) * 2 * DIM + 4] = min_z;
712 domainList->borders[(index + offset) * 2 * DIM + 5] = max_z;
713#endif
714#endif
715
716 //particles->x[cell] = particles->x[childIndex];
717 particles->x[cell] = 0.5 * (min_x + max_x);
718#if DIM > 1
719 //particles->y[cell] = particles->y[childIndex];
720 particles->y[cell] = 0.5 * (min_y + max_y);
721#if DIM == 3
722 //particles->z[cell] = particles->z[childIndex];
723 particles->z[cell] = 0.5 * (min_z + max_z);
724#endif
725#endif
726 // end: new
727#if DEBUGGING
728 #if DIM == 3
729 printf("adding node index %i cell = %i (childPath = %i, j = %i)! x = (%e, %e, %e) level = %i\n",
730 temp, cell, path[j], j, particles->x[childIndex], particles->y[childIndex], particles->z[childIndex],
731 particles->level[cell]);
732#endif
733#endif
734#if DIM == 3
735#if DEBUGGING
736 printf("[rank %i] adding domainListIndices[%i] = %i, childIndex = %i, path = %i\n", subDomainKeyTree->rank,
737 index + offset, domainList->domainListIndices[index + offset], childIndex, path[j-1]);
738 //printf("adding node index %i cell = %i (childPath = %i, j = %i)! x = (%f, %f, %f)\n",
739 // childIndex, cell, path[j], j, particles->x[childIndex], particles->y[childIndex],
740 // particles->z[childIndex]);
741#endif
742#endif
743 } else {
744 //printf("build domain tree, adding node in between...\n");
745 // child is a leaf, thus add node in between
746 integer cell = atomicAdd(tree->index, 1);
747 tree->child[POW_DIM * temp + path[j - 1]] = cell;
748
749 //TODO: was particles->x[cell] = 0.5 * (min_x + max_x);
750 particles->x[cell] = particles->x[childIndex];
751 //particles->x[cell] = 0.5 * (min_x + max_x);
752#if DIM > 1
753 particles->y[cell] = particles->y[childIndex];
754 //particles->y[cell] = 0.5 * (min_y + max_y);
755#if DIM == 3
756 particles->z[cell] = particles->z[childIndex];
757 //particles->z[cell] = 0.5 * (min_z + max_z);
758#endif
759#endif
760
761 domainList->borders[(index + offset) * 2 * DIM] = min_x;
762 domainList->borders[(index + offset) * 2 * DIM + 1] = max_x;
763#if DIM > 1
764 domainList->borders[(index + offset) * 2 * DIM + 2] = min_y;
765 domainList->borders[(index + offset) * 2 * DIM + 3] = max_y;
766#if DIM == 3
767 domainList->borders[(index + offset) * 2 * DIM + 4] = min_z;
768 domainList->borders[(index + offset) * 2 * DIM + 5] = max_z;
769#endif
770#endif
771
772 particles->level[cell] = particles->level[childIndex];
773 //particles->level[childIndex] += 1;
774 atomicAdd(&particles->level[childIndex], 1);
775
776 // insert old/original particle
777 childPath = 0; //(int) (domainListKeys[i] >> (21 * 3 - 3 * ((j+1) + 1)) & (int)7); //0; //currentChild; //0;
778 if (particles->x[childIndex] < 0.5 * (min_x + max_x)) {
779 childPath += 1;
780 //max_x = 0.5 * (min_x + max_x);
781 }
782 //else { min_x = 0.5 * (min_x + max_x); }
783#if DIM > 1
784 if (particles->y[childIndex] < 0.5 * (min_y + max_y)) {
785 childPath += 2;
786 //max_y = 0.5 * (min_y + max_y);
787 }
788 //else { min_y = 0.5 * (min_y + max_y); }
789#if DIM == 3
790 if (particles->z[childIndex] < 0.5 * (min_z + max_z)) {
791 childPath += 4;
792 //max_z = 0.5 * (min_z + max_z);
793 }
794 //else { min_z = 0.5 * (min_z + max_z); }
795#endif
796#endif
797
798
799
800#if DEBUGGING
801 #if DIM == 3
802 printf("adding node in between for index %i, temp = %i cell = %i (childPath = %i, j = %i)! x = (%e, %e, %e) level = %i vs %i\n",
803 childIndex, temp, cell, childPath, j, particles->x[childIndex], particles->y[childIndex], particles->z[childIndex],
804 particles->level[cell], particles->level[childIndex]);
805#endif
806#endif
807
808
809 tree->child[POW_DIM * cell + childPath] = childIndex;
810 //printf("child[8 * %i + %i] = %i\n", cell, childPath, childIndex);
811
812 //particles->nodeType[childIndex] = -10; // just some testing
813
814 childIndex = cell;
815 //domainListCounter = atomicAdd(domainList->domainListCounter, 1);
816 domainList->domainListIndices[index + offset] = childIndex; //temp;
817 particles->nodeType[childIndex] = 1;
818 }
819 } else {
820 // mark/save node as domain list node
821 //domainListCounter = atomicAdd(domainList->domainListCounter, 1);
822#if DEBUGGING
823 printf("[rank %i] Mark already available node %i: %i, path = %i (level = %i)\n",
824 subDomainKeyTree->rank, index + offset, childIndex, path[j-1], level);
825#endif
826
827 domainList->borders[(index + offset) * 2 * DIM] = min_x;
828 domainList->borders[(index + offset) * 2 * DIM + 1] = max_x;
829#if DIM > 1
830 domainList->borders[(index + offset) * 2 * DIM + 2] = min_y;
831 domainList->borders[(index + offset) * 2 * DIM + 3] = max_y;
832#if DIM == 3
833 domainList->borders[(index + offset) * 2 * DIM + 4] = min_z;
834 domainList->borders[(index + offset) * 2 * DIM + 5] = max_z;
835#endif
836#endif
837 domainList->domainListIndices[index + offset] = childIndex; //temp;
838 particles->nodeType[childIndex] = 1;
839
840 }
841 __threadfence();
842 }
843 __syncthreads();
844
845 offset += stride;
846 }
847 }
848
849 __global__ void getParticleKeys(SubDomainKeyTree *subDomainKeyTree, Tree *tree,
850 Particles *particles, keyType *keys, integer maxLevel,
851 integer n, Curve::Type curveType) {
852
853 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
854 integer stride = blockDim.x * gridDim.x;
855 integer offset = 0;
856
857 keyType particleKey;
858 keyType hilbertParticleKey;
859
860 char keyAsChar[21 * 2 + 3];
861 integer proc;
862
863 while (bodyIndex + offset < n) {
864
865 //particleKey = 0UL;
866 particleKey = tree->getParticleKey(particles, bodyIndex + offset, maxLevel, curveType);
867
868 // DEBUG
869 //KeyNS::key2Char(particleKey, 21, keyAsChar);
870 //printf("keyMax: %lu = %s\n", particleKey, keyAsChar);
871 //proc = subDomainKeyTree->key2proc(particleKey);
872 //if (proc == 0) {
873 // atomicAdd(tree->index, 1);
874 //}
875 //if ((bodyIndex + offset) % 1000 == 0) {
876 // printf("[rank %i] proc = %i, particleKey = %s = %lu\n", subDomainKeyTree->rank, proc,
877 // keyAsChar, particleKey);
878 //printf("[rank %i] particleKey = %lu, proc = %i\n", subDomainKeyTree->rank, particleKey,
879 // proc);
880 //}
881 //if (subDomainKeyTree->rank != proc) {
882 // printf("[rank %i] particleKey = %lu, proc = %i\n", subDomainKeyTree->rank, particleKey,
883 // proc);
884 //}
885
886 keys[bodyIndex + offset] = particleKey; //hilbertParticleKey;
887
888 //if ((bodyIndex + offset) % 100 == 0) {
889 // KeyNS::key2Char(particleKey, 21, keyAsChar);
890 // printf("key = %s = %lu\n", keyAsChar, particleKey);
891 //}
892
893 offset += stride;
894 }
895 }
896
897 __global__ void particlesPerProcess(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles,
898 integer n, integer m, Curve::Type curveType) {
899
900 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
901 integer stride = blockDim.x * gridDim.x;
902 integer offset = 0;
903
904 keyType key;
905 integer proc;
906
907 while ((bodyIndex + offset) < n) {
908
909 // calculate particle key from particle's position
910 key = tree->getParticleKey(particles, bodyIndex + offset, MAX_LEVEL, curveType);
911 // get corresponding process
912 proc = subDomainKeyTree->key2proc(key);
913 // increment corresponding counter
914 atomicAdd(&subDomainKeyTree->procParticleCounter[proc], 1);
915
916 offset += stride;
917 }
918
919 }
920
921 __global__ void markParticlesProcess(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles,
922 integer n, integer m, integer *sortArray, Curve::Type curveType) {
923
924 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
925 integer stride = blockDim.x * gridDim.x;
926 integer offset = 0;
927
928 keyType key;
929 integer proc;
930 integer counter;
931
932 while ((bodyIndex + offset) < n) {
933
934 // calculate particle key from particle's position
935 key = tree->getParticleKey(particles, bodyIndex + offset, MAX_LEVEL, curveType);
936 // get corresponding process
937 proc = subDomainKeyTree->key2proc(key);
938 // mark particle with corresponding process
939 sortArray[bodyIndex + offset] = proc;
940
941 offset += stride;
942
943 }
944 }
945
946 __global__ void zeroDomainListNodes(Particles *particles, DomainList *domainList,
947 DomainList *lowestDomainList) {
948
949 integer bodyIndex = threadIdx.x + blockIdx.x*blockDim.x;
950 integer stride = blockDim.x*gridDim.x;
951 integer offset = 0;
952 integer domainIndex;
953 bool zero;
954
955 while ((bodyIndex + offset) < *domainList->domainListIndex) {
956 zero = true;
957 domainIndex = domainList->domainListIndices[bodyIndex + offset];
958 //for (int i=0; i<*lowestDomainList->domainListIndex; i++) {
959 // if (domainIndex == lowestDomainList->domainListIndices[i]) {
960 // zero = false;
961 // break;
962 // }
963 //}
964
965 if (particles->nodeType[domainIndex] == 2) {
966 zero = false;
967 }
968
969 if (zero) {
970 particles->x[domainIndex] = (real)0;
971#if DIM > 1
972 particles->y[domainIndex] = (real)0;
973#if DIM == 3
974 particles->z[domainIndex] = (real)0;
975#endif
976#endif
977 particles->mass[domainIndex] = (real)0;
978 }
979 else {
980 // //printf("domainIndex = %i *= mass = %f\n", domainIndex, particles->mass[domainIndex]);
981 particles->x[domainIndex] *= particles->mass[domainIndex];
982#if DIM > 1
983 particles->y[domainIndex] *= particles->mass[domainIndex];
984#if DIM == 3
985 particles->z[domainIndex] *= particles->mass[domainIndex];
986#endif
987#endif
988 }
989
990 offset += stride;
991 }
992 }
993
994 template <typename T>
995 __global__ void prepareLowestDomainExchange(Particles *particles, DomainList *lowestDomainList,
996 T *buffer, Entry::Name entry) {
997
998 integer bodyIndex = threadIdx.x + blockIdx.x*blockDim.x;
999 integer stride = blockDim.x*gridDim.x;
1000 integer offset = 0;
1001 integer lowestDomainIndex;
1002
1003 //copy x, y, z, mass of lowest domain list nodes into arrays
1004 //sorting using cub (not here)
1005 while ((bodyIndex + offset) < *lowestDomainList->domainListIndex) {
1006 lowestDomainIndex = lowestDomainList->domainListIndices[bodyIndex + offset];
1007 if (lowestDomainIndex >= 0) {
1008 switch (entry) {
1009 case Entry::x: {
1010 buffer[bodyIndex + offset] = particles->x[lowestDomainIndex];
1011 } break;
1012#if DIM > 1
1013 case Entry::y: {
1014 buffer[bodyIndex + offset] = particles->y[lowestDomainIndex];
1015 } break;
1016#if DIM == 3
1017 case Entry::z: {
1018 buffer[bodyIndex + offset] = particles->z[lowestDomainIndex];
1019 } break;
1020#endif
1021#endif
1022 case Entry::mass: {
1023 buffer[bodyIndex + offset] = particles->mass[lowestDomainIndex];
1024 } break;
1025 default:
1026 printf("prepareLowestDomainExchange(): Not available!\n");
1027 }
1028 }
1029 offset += stride;
1030 }
1031 }
1032
1033 template <typename T>
1034 __global__ void updateLowestDomainListNodes(Particles *particles, DomainList *lowestDomainList,
1035 T *buffer, Entry::Name entry) {
1036
1037 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1038 integer stride = blockDim.x * gridDim.x;
1039 integer offset = 0;
1040
1041 integer originalIndex = -1;
1042
1043 while ((bodyIndex + offset) < *lowestDomainList->domainListIndex) {
1044 originalIndex = -1;
1045 for (int i = 0; i < *lowestDomainList->domainListIndex; i++) {
1046 if (lowestDomainList->sortedDomainListKeys[bodyIndex + offset] ==
1047 lowestDomainList->domainListKeys[i]) {
1048 originalIndex = i;
1049 }
1050 }
1051
1052 if (originalIndex == -1) {
1053 cudaTerminate("ATTENTION: originalIndex = -1 (index = %i)!\n",
1054 lowestDomainList->sortedDomainListKeys[bodyIndex + offset]);
1055 }
1056
1057 switch (entry) {
1058 case Entry::x: {
1059 particles->x[lowestDomainList->domainListIndices[originalIndex]] =
1060 buffer[bodyIndex + offset];
1061 } break;
1062#if DIM > 1
1063 case Entry::y: {
1064 particles->y[lowestDomainList->domainListIndices[originalIndex]] =
1065 buffer[bodyIndex + offset];
1066 } break;
1067#if DIM == 3
1068 case Entry::z: {
1069 particles->z[lowestDomainList->domainListIndices[originalIndex]] =
1070 buffer[bodyIndex + offset];
1071 } break;
1072#endif
1073#endif
1074 case Entry::mass: {
1075 particles->mass[lowestDomainList->domainListIndices[originalIndex]] =
1076 buffer[bodyIndex + offset];
1077 } break;
1078 default: {
1079 printf("Entry not available!\n");
1080 }
1081 }
1082
1083 offset += stride;
1084 }
1085 }
1086
1087 __global__ void compLowestDomainListNodes(Tree *tree, Particles *particles, DomainList *lowestDomainList) {
1088
1089 integer bodyIndex = threadIdx.x + blockIdx.x*blockDim.x;
1090 integer stride = blockDim.x*gridDim.x;
1091 integer offset = 0;
1092 integer lowestDomainIndex;
1093 bool divide;
1094
1095 while ((bodyIndex + offset) < *lowestDomainList->domainListIndex) {
1096
1097 //divide = false;
1098 lowestDomainIndex = lowestDomainList->domainListIndices[bodyIndex + offset];
1099
1100 //for (int child=0; child<POW_DIM; child++) {
1101 // if (tree->child[POW_DIM * lowestDomainIndex + child] != -1) {
1102 // printf("lowestDomainIndex: tree->child[8 * %i + %i] = %i\n", lowestDomainIndex, child,
1103 // tree->child[POW_DIM * lowestDomainIndex + child]);
1104 // divide = true;
1105 // break;
1106 // }
1107 //}
1108
1109 //if (particles->mass[lowestDomainIndex] != (real)0) {
1110 //if (particles->mass[lowestDomainIndex] > (real)0) {
1111 if (/*divide && */particles->mass[lowestDomainIndex] > (real)0) {
1112
1113#if DIM == 3
1114 //printf("lowestDomainIndex: %i (%f, %f, %f) %f\n", lowestDomainIndex, particles->x[lowestDomainIndex],
1115 // particles->y[lowestDomainIndex], particles->z[lowestDomainIndex], particles->mass[lowestDomainIndex]);
1116#endif
1117
1118 particles->x[lowestDomainIndex] /= particles->mass[lowestDomainIndex];
1119#if DIM > 1
1120 particles->y[lowestDomainIndex] /= particles->mass[lowestDomainIndex];
1121#if DIM == 3
1122 particles->z[lowestDomainIndex] /= particles->mass[lowestDomainIndex];
1123#endif
1124#endif
1125
1126#if DIM == 3
1127 //printf("lowestDomainIndex: %i (%f, %f, %f) %f\n", lowestDomainIndex, particles->x[lowestDomainIndex],
1128 // particles->y[lowestDomainIndex], particles->z[lowestDomainIndex], particles->mass[lowestDomainIndex]);
1129#endif
1130 }
1131
1132 //printf("lowestDomainIndex = %i (%f, %f, %f) %f\n", lowestDomainIndex, particles->x[lowestDomainIndex],
1133 // particles->y[lowestDomainIndex], particles->z[lowestDomainIndex], particles->mass[lowestDomainIndex]);
1134
1135 offset += stride;
1136 }
1137 }
1138
1139 __global__ void compLocalPseudoParticles(Tree *tree, Particles *particles, DomainList *domainList, int n) {
1140
1141 integer bodyIndex = threadIdx.x + blockIdx.x*blockDim.x;
1142 integer stride = blockDim.x*gridDim.x;
1143 integer offset = 0;
1144 bool isDomainList;
1145
1146 bodyIndex += n;
1147
1148 while (bodyIndex + offset < *tree->index) {
1149 isDomainList = false;
1150
1151 //for (integer i=0; i<*domainList->domainListIndex; i++) {
1152 // if ((bodyIndex + offset) == domainList->domainListIndices[i]) {
1153 // isDomainList = true; // hence do not insert
1154 // break;
1155 // }
1156 //}
1157 if (particles->nodeType[bodyIndex + offset] >= 1) {
1158 isDomainList = true;
1159 }
1160
1161 if (/*particles->mass[bodyIndex + offset] != 0 && */!isDomainList) {
1162 particles->x[bodyIndex + offset] /= particles->mass[bodyIndex + offset];
1163#if DIM > 1
1164 particles->y[bodyIndex + offset] /= particles->mass[bodyIndex + offset];
1165#if DIM == 3
1166 particles->z[bodyIndex + offset] /= particles->mass[bodyIndex + offset];
1167#endif
1168#endif
1169 }
1170
1171 offset += stride;
1172 }
1173 }
1174
1175 __global__ void compDomainListPseudoParticlesPerLevel(Tree *tree, Particles *particles, DomainList *domainList,
1176 DomainList *lowestDomainList, int n, int level) {
1177
1178 integer bodyIndex = threadIdx.x + blockIdx.x*blockDim.x;
1179 integer stride = blockDim.x*gridDim.x;
1180 integer offset;
1181
1182 integer domainIndex;
1183 //integer level = MAX_LEVEL; // max level
1184 bool compute;
1185
1186 real totalMass, x_mass, y_mass, z_mass;
1187 int childToLookAt = 5;
1188 int lowestDomainIndex, childLowestDomain;
1189
1190 offset = 0;
1191 compute = true;
1192 while ((bodyIndex + offset) < *domainList->domainListIndex) {
1193 compute = true;
1194 domainIndex = domainList->domainListIndices[bodyIndex + offset];
1195 if (particles->nodeType[domainIndex] == 2) {
1196 compute = false;
1197 }
1198 //for (int i=0; i<*lowestDomainList->domainListIndex; i++) {
1199 // if (domainIndex == lowestDomainList->domainListIndices[i]) {
1200 // compute = false;
1201 // /*lowestDomainIndex = domainIndex;
1202 // while (lowestDomainIndex != -1) {
1203 // //childLowestDomain;
1204 // printf("(%lu) lowestDomainIndex: %i (%e, %e, %e) %e\n", lowestDomainList->domainListKeys[i],
1205 // lowestDomainIndex,
1206 // particles->x[lowestDomainIndex], particles->y[lowestDomainIndex],
1207 // particles->z[lowestDomainIndex], particles->mass[lowestDomainIndex]);
1208 // totalMass = 0.;
1209 // x_mass = 0.;
1210 // y_mass = 0.;
1211 // z_mass = 0.;
1212 // for (int i_child = 0; i_child < POW_DIM; i_child++) {
1213 // childLowestDomain = tree->child[POW_DIM * lowestDomainIndex + i_child];
1214 // if (childLowestDomain != -1) {
1215 // //printf("totalMass += %e\n", particles->mass[childLowestDomain]);
1216 // totalMass += particles->mass[childLowestDomain];
1217 // x_mass += particles->mass[childLowestDomain] * particles->x[childLowestDomain];
1218 // y_mass += particles->mass[childLowestDomain] * particles->y[childLowestDomain];
1219 // z_mass += particles->mass[childLowestDomain] * particles->z[childLowestDomain];
1220 // //if (totalMass > 0.) {
1221 // // printf("totalMass = %e > %e\n", totalMass, particles->mass[lowestDomainIndex]);
1222 // //}
1223 // }
1224 // //if (particles->mass[childLowestDomain] > 0.) {
1225 // // printf("!= 0: %e\n", particles->mass[childLowestDomain]);
1226 // //}
1227 // printf("(%lu) lowestDomainIndex: %i child #%i -> %i (%e, %e, %e) %e\n",
1228 // lowestDomainList->domainListKeys[i],
1229 // lowestDomainIndex, i_child, childLowestDomain,
1230 // particles->x[childLowestDomain], particles->y[childLowestDomain],
1231 // particles->z[childLowestDomain], particles->mass[childLowestDomain]);
1232 // }
1233 // x_mass /= totalMass;
1234 // y_mass /= totalMass;
1235 // z_mass /= totalMass;
1236 // if (totalMass > (particles->mass[lowestDomainIndex] + 1e-7)) { //||
1237 // //x_mass < (particles->x[lowestDomainIndex] - 1e-7) ||
1238 // //y_mass < (particles->y[lowestDomainIndex] - 1e-7) ||
1239 // //z_mass < (particles->z[lowestDomainIndex] - 1e-7)) {
1240 // //if (totalMass > 0.) {
1241 // printf("totalMass = %e > %e\n", totalMass, particles->mass[lowestDomainIndex]);
1242 // //}
1243 // assert(0);
1244 // }
1245 // lowestDomainIndex = tree->child[POW_DIM * lowestDomainIndex + childToLookAt];
1246 // }
1247 // break;
1248 // }
1249 //}
1250 if (compute && domainList->domainListLevels[bodyIndex + offset] == level) {
1251 // do the calculation
1252 /*
1253 particles->x[domainIndex] += 0.;
1254#if DIM > 1
1255 particles->y[domainIndex] += 0.;
1256#if DIM == 3
1257 particles->z[domainIndex] += 0.;
1258#endif
1259#endif
1260 particles->mass[domainIndex] += 0.;
1261 */
1262 for (int i=0; i<POW_DIM; i++) {
1263 particles->x[domainIndex] += particles->x[tree->child[POW_DIM*domainIndex + i]] *
1264 particles->mass[tree->child[POW_DIM*domainIndex + i]];
1265#if DIM > 1
1266 particles->y[domainIndex] += particles->y[tree->child[POW_DIM*domainIndex + i]] *
1267 particles->mass[tree->child[POW_DIM*domainIndex + i]];
1268#if DIM == 3
1269 particles->z[domainIndex] += particles->z[tree->child[POW_DIM*domainIndex + i]] *
1270 particles->mass[tree->child[POW_DIM*domainIndex + i]];
1271#endif
1272#endif
1273 particles->mass[domainIndex] += particles->mass[tree->child[POW_DIM*domainIndex + i]];
1274 }
1275
1276 if (particles->mass[domainIndex] > 0.) {
1277 particles->x[domainIndex] /= particles->mass[domainIndex];
1278#if DIM > 1
1279 particles->y[domainIndex] /= particles->mass[domainIndex];
1280#if DIM == 3
1281 particles->z[domainIndex] /= particles->mass[domainIndex];
1282#endif
1283#endif
1284 }
1285 }
1286 offset += stride;
1287 }
1288 __syncthreads();
1289 }
1290
1291 __global__ void compDomainListPseudoParticles(Tree *tree, Particles *particles, DomainList *domainList,
1292 DomainList *lowestDomainList, int n) {
1293 //calculate position (center of mass) and mass for domain list nodes
1294 //Problem: start with "deepest" nodes
1295 integer bodyIndex = threadIdx.x + blockIdx.x*blockDim.x;
1296 integer stride = blockDim.x*gridDim.x;
1297 integer offset;
1298
1299 integer domainIndex;
1300 integer level = MAX_LEVEL; // max level
1301 bool compute;
1302
1303 // go from max level to level=0
1304 while (level >= 0) {
1305 offset = 0;
1306 compute = true;
1307 while ((bodyIndex + offset) < *domainList->domainListIndex) {
1308 compute = true;
1309 domainIndex = domainList->domainListIndices[bodyIndex + offset];
1310 if (particles->nodeType[domainIndex] == 2) {
1311 compute = false;
1312 }
1313 //for (int i=0; i<*lowestDomainList->domainListIndex; i++) {
1314 // if (domainIndex == lowestDomainList->domainListIndices[i]) {
1315 // compute = false;
1316 // }
1317 //}
1318 if (compute && domainList->domainListLevels[bodyIndex + offset] == level) {
1319 // do the calculation
1320 for (int i=0; i<POW_DIM; i++) {
1321 particles->x[domainIndex] += particles->x[tree->child[POW_DIM*domainIndex + i]] *
1322 particles->mass[tree->child[POW_DIM*domainIndex + i]];
1323#if DIM > 1
1324 particles->y[domainIndex] += particles->y[tree->child[POW_DIM*domainIndex + i]] *
1325 particles->mass[tree->child[POW_DIM*domainIndex + i]];
1326#if DIM == 3
1327 particles->z[domainIndex] += particles->z[tree->child[POW_DIM*domainIndex + i]] *
1328 particles->mass[tree->child[POW_DIM*domainIndex + i]];
1329#endif
1330#endif
1331 particles->mass[domainIndex] += particles->mass[tree->child[POW_DIM*domainIndex + i]];
1332 }
1333
1334 if (particles->mass[domainIndex] != 0.f) {
1335 particles->x[domainIndex] /= particles->mass[domainIndex];
1336#if DIM > 1
1337 particles->y[domainIndex] /= particles->mass[domainIndex];
1338#if DIM == 3
1339 particles->z[domainIndex] /= particles->mass[domainIndex];
1340#endif
1341#endif
1342 }
1343 }
1344 offset += stride;
1345 }
1346 __syncthreads();
1347 level--;
1348 }
1349 }
1350
1351 // delete the children of (lowest) domain list nodes if
1352 // - toDeleteLeaf[0] < child < numParticles
1353 // - child > toDeleteNode[0]
1354 // since problem for predictor-corrector decoupled gravity (having particles belonging to another process)
1355 __global__ void repairTree(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles,
1356 DomainList *domainList, DomainList *lowestDomainList,
1357 int n, int m, Curve::Type curveType) {
1358
1359 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1360 integer stride = blockDim.x * gridDim.x;
1361 integer offset = 0;
1362
1363 keyType key;
1364 int domainIndex;
1365 int proc;
1366
1367 if (bodyIndex + offset == 0) {
1368 *tree->index = tree->toDeleteNode[0];
1369 }
1370
1371 while ((bodyIndex + offset) < *domainList->domainListIndex) {
1372 domainIndex = domainList->domainListIndices[bodyIndex + offset];
1373 for (int i=0; i<POW_DIM; i++) {
1374 if ((tree->child[POW_DIM * domainIndex + i] >= tree->toDeleteNode[0]
1375 || (tree->child[POW_DIM * domainIndex + i] >= tree->toDeleteLeaf[0] &&
1376 tree->child[POW_DIM * domainIndex + i] < n))
1377 && particles->nodeType[tree->child[POW_DIM * domainIndex + i]] < 1) {
1378
1379 tree->child[POW_DIM * domainIndex + i] = -1;
1380
1381 }
1382 }
1383 offset += stride;
1384 }
1385 // alternatively:
1386 //while ((bodyIndex + offset) < *lowestDomainList->domainListIndex) {
1387 // domainIndex = lowestDomainList->domainListIndices[bodyIndex + offset];
1388 // //key = tree->getParticleKey(particles, domainIndex, MAX_LEVEL, curveType); // working version
1389 // //proc = subDomainKeyTree->key2proc(key);
1390 // // //printf("[rank %i] deleting: proc = %i\n", subDomainKeyTree->rank, proc);
1391 // //if (proc != subDomainKeyTree->rank) {
1392 // // for (int i=0; i<POW_DIM; i++) {
1393 // // //printf("[rank %i] deleting: POWDIM * %i + %i = %i\n", subDomainKeyTree->rank, domainIndex, i, tree->child[POW_DIM * domainIndex + i]);
1394 // // tree->child[POW_DIM * domainIndex + i] = -1;
1395 // // }
1396 // //}
1397 // offset += stride;
1398 //}
1399
1400 offset = tree->toDeleteLeaf[0];
1401 //delete inserted leaves
1402 while ((bodyIndex + offset) >= tree->toDeleteLeaf[0] && (bodyIndex + offset) < tree->toDeleteLeaf[1]) {
1403 for (int i=0; i<POW_DIM; i++) {
1404 tree->child[(bodyIndex + offset)*POW_DIM + i] = -1;
1405 }
1406 tree->count[bodyIndex + offset] = 1;
1407
1408 particles->x[bodyIndex + offset] = 0.;
1409 particles->vx[bodyIndex + offset] = 0.;
1410 particles->ax[bodyIndex + offset] = 0.;
1411 particles->g_ax[bodyIndex + offset] = 0.;
1412#if DIM > 1
1413 particles->y[bodyIndex + offset] = 0.;
1414 particles->vy[bodyIndex + offset] = 0.;
1415 particles->ay[bodyIndex + offset] = 0.;
1416 particles->g_ay[bodyIndex + offset] = 0.;
1417#if DIM == 3
1418 particles->z[bodyIndex + offset] = 0.;
1419 particles->vz[bodyIndex + offset] = 0.;
1420 particles->az[bodyIndex + offset] = 0.;
1421 particles->g_az[bodyIndex + offset] = 0.;
1422#endif
1423#endif
1424 particles->mass[bodyIndex + offset] = 0.;
1425 tree->start[bodyIndex + offset] = -1;
1426 tree->sorted[bodyIndex + offset] = 0;
1427
1428 offset += stride;
1429 }
1430
1431 offset = tree->toDeleteNode[0];
1432 //delete inserted cells
1433 while ((bodyIndex + offset) >= tree->toDeleteNode[0] && (bodyIndex + offset) < tree->toDeleteNode[1]) {
1434 for (int i=0; i<POW_DIM; i++) {
1435 tree->child[(bodyIndex + offset)*POW_DIM + i] = -1;
1436 }
1437 tree->count[bodyIndex + offset] = 0;
1438 particles->x[bodyIndex + offset] = 0.;
1439 //particles->vx[bodyIndex + offset] = 0.;
1440 //particles->ax[bodyIndex + offset] = 0.;
1441#if DIM > 1
1442 particles->y[bodyIndex + offset] = 0.;
1443 //particles->vy[bodyIndex + offset] = 0.;
1444 //particles->ay[bodyIndex + offset] = 0.;
1445#if DIM == 3
1446 particles->z[bodyIndex + offset] = 0.;
1447 //particles->vz[bodyIndex + offset] = 0.;
1448 //particles->az[bodyIndex + offset] = 0.;
1449#endif
1450#endif
1451 particles->mass[bodyIndex + offset] = 0.;
1452 //tree->start[bodyIndex + offset] = -1;
1453 //tree->sorted[bodyIndex + offset] = 0;
1454
1455 offset += stride;
1456 }
1457 }
1458
1459 __global__ void createKeyHistRanges(Helper *helper, integer bins) {
1460
1461 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1462 integer stride = blockDim.x * gridDim.x;
1463 integer offset = 0;
1464
1465 keyType max_key = 1UL << (DIM * 21);//1UL << 63;
1466
1467 while ((bodyIndex + offset) < bins) {
1468
1469 helper->keyTypeBuffer[bodyIndex + offset] = (bodyIndex + offset) * (max_key/bins);
1470 //printf("keyHistRanges[%i] = %lu\n", bodyIndex + offset, keyHistRanges[bodyIndex + offset]);
1471
1472 if ((bodyIndex + offset) == (bins - 1)) {
1473 helper->keyTypeBuffer[bins-1] = KEY_MAX;
1474 }
1475 offset += stride;
1476 }
1477 }
1478
1479 __global__ void keyHistCounter(Tree *tree, Particles *particles, SubDomainKeyTree *subDomainKeyTree,
1480 Helper *helper, /*keyType *keyHistRanges, integer *keyHistCounts,*/
1481 int bins, int n, Curve::Type curveType) {
1482
1483 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1484 integer stride = blockDim.x * gridDim.x;
1485 integer offset = 0;
1486
1487 keyType key;
1488
1489 while ((bodyIndex + offset) < n) {
1490
1491 key = tree->getParticleKey(particles, bodyIndex + offset, MAX_LEVEL, curveType);
1492
1493 for (int i = 0; i < (bins); i++) {
1494 if (key >= helper->keyTypeBuffer[i] && key < helper->keyTypeBuffer[i + 1]) {
1495 //keyHistCounts[i] += 1;
1496 atomicAdd(&helper->integerBuffer[i], 1);
1497 break;
1498 }
1499 }
1500
1501 offset += stride;
1502 }
1503
1504 }
1505
1506 //TODO: resetting helper (buffers)?!
1507 __global__ void calculateNewRange(SubDomainKeyTree *subDomainKeyTree, Helper *helper,
1508 /*keyType *keyHistRanges, integer *keyHistCounts,*/
1509 int bins, int n, Curve::Type curveType) {
1510
1511 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1512 integer stride = blockDim.x * gridDim.x;
1513 integer offset = 0;
1514
1515 integer sum;
1516 keyType newRange;
1517
1518 while ((bodyIndex + offset) < (bins-1)) {
1519
1520 sum = 0;
1521 for (integer i=0; i<(bodyIndex+offset); i++) {
1522 sum += helper->integerBuffer[i];
1523 }
1524
1525 for (integer i=1; i<subDomainKeyTree->numProcesses; i++) {
1526 if ((sum + helper->integerBuffer[bodyIndex + offset]) >= (i*n) && sum < (i*n)) {
1527
1528 subDomainKeyTree->range[i] = (helper->keyTypeBuffer[bodyIndex + offset] >> (1*DIM)) << (1*DIM);
1529 }
1530 }
1531 //printf("[rank %i] keyHistCounts[%i] = %i\n", s->rank, bodyIndex+offset, keyHistCounts[bodyIndex+offset]);
1532 atomicAdd(helper->integerVal, helper->integerBuffer[bodyIndex+offset]);
1533 offset += stride;
1534 }
1535
1536 }
1537
1538 void Launch::set(SubDomainKeyTree *subDomainKeyTree, integer rank, integer numProcesses, keyType *range,
1539 integer *procParticleCounter) {
1540 ExecutionPolicy executionPolicy(1, 1);
1541 cuda::launch(false, executionPolicy, ::SubDomainKeyTreeNS::Kernel::set, subDomainKeyTree, rank,
1542 numProcesses, range, procParticleCounter);
1543 }
1544
1545 void Launch::test(SubDomainKeyTree *subDomainKeyTree) {
1546 ExecutionPolicy executionPolicy(1, 1);
1547 cuda::launch(false, executionPolicy, ::SubDomainKeyTreeNS::Kernel::test, subDomainKeyTree);
1548 }
1549
1550 real Launch::buildDomainTree(Tree *tree, Particles *particles, DomainList *domainList, integer n, integer m) {
1551 // is there any possibility to call kernel with more than one thread?: YES see corresponding function below
1552 ExecutionPolicy executionPolicy(1, 1);
1553 return cuda::launch(true, executionPolicy, ::SubDomainKeyTreeNS::Kernel::buildDomainTree, tree, particles,
1554 domainList, n, m);
1555 }
1556
1557 real Launch::buildDomainTree(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, integer n, integer m, integer level) {
1558 ExecutionPolicy executionPolicy;
1559 return cuda::launch(true, executionPolicy, ::SubDomainKeyTreeNS::Kernel::buildDomainTree,
1560 subDomainKeyTree, tree, particles,
1561 domainList, n, m, level);
1562 }
1563
1564 real Launch::getParticleKeys(SubDomainKeyTree *subDomainKeyTree, Tree *tree,
1565 Particles *particles, keyType *keys, integer maxLevel,
1566 integer n, Curve::Type curveType) {
1567 ExecutionPolicy executionPolicy;
1568 return cuda::launch(true, executionPolicy, ::SubDomainKeyTreeNS::Kernel::getParticleKeys, subDomainKeyTree,
1569 tree, particles, keys, maxLevel, n, curveType);
1570 }
1571
1572 real Launch::particlesPerProcess(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles,
1573 integer n, integer m, Curve::Type curveType) {
1574 ExecutionPolicy executionPolicy;
1575 return cuda::launch(true, executionPolicy, ::SubDomainKeyTreeNS::Kernel::particlesPerProcess,
1576 subDomainKeyTree, tree, particles, n, m, curveType);
1577 }
1578
1579 real Launch::markParticlesProcess(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles,
1580 integer n, integer m, integer *sortArray,
1581 Curve::Type curveType) {
1582 ExecutionPolicy executionPolicy;
1583 return cuda::launch(true, executionPolicy, ::SubDomainKeyTreeNS::Kernel::markParticlesProcess,
1584 subDomainKeyTree, tree, particles, n, m, sortArray, curveType);
1585 }
1586
1587 real Launch::zeroDomainListNodes(Particles *particles, DomainList *domainList, DomainList *lowestDomainList) {
1588 ExecutionPolicy executionPolicy;
1589 return cuda::launch(true, executionPolicy, ::SubDomainKeyTreeNS::Kernel::zeroDomainListNodes, particles, domainList,
1590 lowestDomainList);
1591 }
1592
1593 template <typename T>
1594 real Launch::prepareLowestDomainExchange(Particles *particles, DomainList *lowestDomainList,
1595 T *buffer, Entry::Name entry) {
1596 ExecutionPolicy executionPolicy;
1597 return cuda::launch(true, executionPolicy, ::SubDomainKeyTreeNS::Kernel::prepareLowestDomainExchange, particles,
1598 lowestDomainList, buffer, entry);
1599 }
1600
1601 template real Launch::prepareLowestDomainExchange<real>(Particles *particles, DomainList *lowestDomainList,
1602 real *buffer, Entry::Name entry);
1603
1604 template <typename T>
1605 real Launch::updateLowestDomainListNodes(Particles *particles, DomainList *lowestDomainList,
1606 T *buffer, Entry::Name entry) {
1607 ExecutionPolicy executionPolicy;
1608 return cuda::launch(true, executionPolicy, ::SubDomainKeyTreeNS::Kernel::updateLowestDomainListNodes, particles,
1609 lowestDomainList, buffer, entry);
1610
1611 }
1612
1613 template real Launch::updateLowestDomainListNodes<real>(Particles *particles, DomainList *lowestDomainList,
1614 real *buffer, Entry::Name entry);
1615
1616 real Launch::compLowestDomainListNodes(Tree *tree, Particles *particles, DomainList *lowestDomainList) {
1617 ExecutionPolicy executionPolicy;
1618 return cuda::launch(true, executionPolicy, ::SubDomainKeyTreeNS::Kernel::compLowestDomainListNodes, tree, particles,
1619 lowestDomainList);
1620 }
1621
1622 real Launch::compLocalPseudoParticles(Tree *tree, Particles *particles, DomainList *domainList, int n) {
1623 ExecutionPolicy executionPolicy;
1624 return cuda::launch(true, executionPolicy, ::SubDomainKeyTreeNS::Kernel::compLocalPseudoParticles, tree, particles,
1625 domainList, n);
1626 }
1627
1628 real Launch::compDomainListPseudoParticlesPerLevel(Tree *tree, Particles *particles, DomainList *domainList,
1629 DomainList *lowestDomainList, int n, int level) {
1630 ExecutionPolicy executionPolicy(256, 1);
1631 return cuda::launch(true, executionPolicy, ::SubDomainKeyTreeNS::Kernel::compDomainListPseudoParticlesPerLevel, tree,
1632 particles, domainList, lowestDomainList, n, level);
1633 }
1634
1635 real Launch::compDomainListPseudoParticles(Tree *tree, Particles *particles, DomainList *domainList,
1636 DomainList *lowestDomainList, int n) {
1637 ExecutionPolicy executionPolicy(256, 1);
1638 return cuda::launch(true, executionPolicy, ::SubDomainKeyTreeNS::Kernel::compDomainListPseudoParticles, tree,
1639 particles, domainList, lowestDomainList, n);
1640 }
1641
1642 real Launch::repairTree(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles,
1643 DomainList *domainList, DomainList *lowestDomainList,
1644 int n, int m, Curve::Type curveType) {
1645 ExecutionPolicy executionPolicy;
1646 return cuda::launch(true, executionPolicy, ::SubDomainKeyTreeNS::Kernel::repairTree, subDomainKeyTree, tree,
1647 particles, domainList, lowestDomainList, n, m, curveType);
1648 }
1649
1650 real Launch::createKeyHistRanges(Helper *helper, integer bins) {
1651 ExecutionPolicy executionPolicy;
1652 return cuda::launch(true, executionPolicy, ::SubDomainKeyTreeNS::Kernel::createKeyHistRanges, helper, bins);
1653 }
1654
1655 real Launch::keyHistCounter(Tree *tree, Particles *particles, SubDomainKeyTree *subDomainKeyTree,
1656 Helper *helper, int bins, int n, Curve::Type curveType) {
1657 ExecutionPolicy executionPolicy;
1658 return cuda::launch(true, executionPolicy, ::SubDomainKeyTreeNS::Kernel::keyHistCounter, tree, particles,
1659 subDomainKeyTree, helper, bins, n, curveType);
1660 }
1661
1662 real Launch::calculateNewRange(SubDomainKeyTree *subDomainKeyTree, Helper *helper, int bins, int n,
1663 Curve::Type curveType) {
1664 ExecutionPolicy executionPolicy;
1665 return cuda::launch(true, executionPolicy, ::SubDomainKeyTreeNS::Kernel::calculateNewRange, subDomainKeyTree, helper,
1666 bins, n, curveType);
1667 }
1668
1669
1670 }
1671
1672}
1673
1674CUDA_CALLABLE_MEMBER DomainList::DomainList() {
1675
1676}
1677
1678CUDA_CALLABLE_MEMBER DomainList::DomainList(integer *domainListIndices, integer *domainListLevels,
1679 integer *domainListIndex, integer *domainListCounter,
1680 keyType *domainListKeys, keyType *sortedDomainListKeys,
1681 integer *relevantDomainListIndices, integer *relevantDomainListLevels,
1682 integer *relevantDomainListProcess) :
1683 domainListIndices(domainListIndices), domainListLevels(domainListLevels),
1684 domainListIndex(domainListIndex), domainListCounter(domainListCounter),
1685 domainListKeys(domainListKeys), sortedDomainListKeys(sortedDomainListKeys),
1686 relevantDomainListIndices(relevantDomainListIndices),
1687 relevantDomainListLevels(relevantDomainListLevels),
1688 relevantDomainListProcess(relevantDomainListProcess) {
1689
1690}
1691
1692CUDA_CALLABLE_MEMBER DomainList::~DomainList() {
1693
1694}
1695
1696CUDA_CALLABLE_MEMBER void DomainList::set(integer *domainListIndices, integer *domainListLevels,
1697 integer *domainListIndex, integer *domainListCounter, keyType *domainListKeys,
1698 keyType *sortedDomainListKeys, integer *relevantDomainListIndices,
1699 integer *relevantDomainListLevels, integer *relevantDomainListProcess) {
1700
1701 this->domainListIndices = domainListIndices;
1702 this->domainListLevels = domainListLevels;
1703 this->domainListIndex = domainListIndex;
1704 this->domainListCounter = domainListCounter;
1705 this->domainListKeys = domainListKeys;
1706 this->sortedDomainListKeys = sortedDomainListKeys;
1707 this->relevantDomainListIndices = relevantDomainListIndices;
1708 this->relevantDomainListLevels = relevantDomainListLevels;
1709 this->relevantDomainListProcess = relevantDomainListProcess;
1710
1711 *domainListIndex = 0;
1712}
1713
1714CUDA_CALLABLE_MEMBER void DomainList::setBorders(real *borders, integer *relevantDomainListOriginalIndex) {
1715 this->borders = borders;
1716 this->relevantDomainListOriginalIndex = relevantDomainListOriginalIndex;
1717}
1718
1719namespace DomainListNS {
1720
1721 namespace Kernel {
1722
1723 __global__ void set(DomainList *domainList, integer *domainListIndices, integer *domainListLevels,
1724 integer *domainListIndex, integer *domainListCounter, keyType *domainListKeys,
1725 keyType *sortedDomainListKeys, integer *relevantDomainListIndices,
1726 integer *relevantDomainListLevels, integer *relevantDomainListProcess) {
1727
1728 domainList->set(domainListIndices, domainListLevels, domainListIndex, domainListCounter, domainListKeys,
1729 sortedDomainListKeys, relevantDomainListIndices, relevantDomainListLevels,
1730 relevantDomainListProcess);
1731 }
1732
1733 __global__ void setBorders(DomainList *domainList, real *borders, integer *relevantDomainListOriginalIndex) {
1734 domainList->setBorders(borders, relevantDomainListOriginalIndex);
1735 }
1736
1737 __global__ void info(Particles *particles, DomainList *domainList) {
1738
1739 integer index = threadIdx.x + blockIdx.x * blockDim.x;
1740 integer stride = blockDim.x * gridDim.x;
1741 integer offset = 0;
1742
1743 integer domainListIndex;
1744
1745 //if (index == 0) {
1746 // printf("domainListIndices = [");
1747 // for (int i=0; i<*domainList->domainListIndex; i++) {
1748 // printf("%i, ", domainList->domainListIndices[i]);
1749 // }
1750 // printf("]\n");
1751 //}
1752
1753#if DIM == 3
1754 while ((index + offset) < *domainList->domainListIndex) {
1755
1756 domainListIndex = domainList->domainListIndices[index + offset];
1757
1758 if (true) {
1759 printf("domainListIndices[%i] = %i, level = %i, x = (%f, %f, %f) mass = %f\n", index + offset,
1760 domainListIndex, domainList->domainListLevels[index + offset], particles->x[domainListIndex],
1761 particles->y[domainListIndex], particles->z[domainListIndex],
1762 particles->mass[domainListIndex]);
1763 }
1764
1765 offset += stride;
1766 }
1767#endif
1768
1769 //if (index == 0) {
1770 // while ((index + offset) < *domainList->domainListIndex) {
1771 // domainListIndex = domainList->domainListIndices[index + offset];
1772 // }
1773 //}
1774
1775 }
1776
1777 __global__ void info(Particles *particles, DomainList *domainList, DomainList *lowestDomainList) {
1778
1779 integer index = threadIdx.x + blockIdx.x * blockDim.x;
1780 integer stride = blockDim.x * gridDim.x;
1781 integer offset = 0;
1782
1783 integer domainListIndex;
1784
1785 //if (index == 0) {
1786 // printf("domainListIndices = [");
1787 // for (int i=0; i<*domainList->domainListIndex; i++) {
1788 // printf("%i, ", domainList->domainListIndices[i]);
1789 // }
1790 // printf("]\n");
1791 //}
1792
1793 bool show;
1794
1795 while ((index + offset) < *domainList->domainListIndex) {
1796
1797 show = true;
1798 domainListIndex = domainList->domainListIndices[index + offset];
1799
1800 //for (int i=0; i<*lowestDomainList->domainListIndex; i++) {
1801 // if (lowestDomainList->domainListIndices[i] == domainListIndex) {
1802 // printf("domainListIndices[%i] = %i, x = (%f, %f, %f) mass = %f\n", index + offset,
1803 // domainListIndex, particles->x[domainListIndex],
1804 // particles->y[domainListIndex], particles->z[domainListIndex], particles->mass[domainListIndex]);
1805 // }
1806 //}
1807
1808 for (int i=0; i<*lowestDomainList->domainListIndex; i++) {
1809 if (lowestDomainList->domainListIndices[i] == domainListIndex) {
1810 show = false;
1811 }
1812 }
1813
1814 if (show) {
1815#if DIM == 1
1816 printf("domainListIndices[%i] = %i, x = (%f) mass = %f\n", index + offset,
1817 domainListIndex, particles->x[domainListIndex], particles->mass[domainListIndex]);
1818#elif DIM == 2
1819 printf("domainListIndices[%i] = %i, x = (%f, %f) mass = %f\n", index + offset,
1820 domainListIndex, particles->x[domainListIndex],
1821 particles->y[domainListIndex], particles->mass[domainListIndex]);
1822#else
1823 printf("domainListIndices[%i] = %i, x = (%f, %f, %f) mass = %f\n", index + offset,
1824 domainListIndex, particles->x[domainListIndex],
1825 particles->y[domainListIndex], particles->z[domainListIndex], particles->mass[domainListIndex]);
1826#endif
1827 }
1828
1829 offset += stride;
1830 }
1831
1832 }
1833
1834 /*
1835 * //TODO: parallel version
1836 * * either on CPU
1837 * * or distribute key2test = 0UL < keyMax between threads
1838 */
1839 /*
1840 __global__ void createDomainList(SubDomainKeyTree *subDomainKeyTree, DomainList *domainList,
1841 integer maxLevel, Curve::Type curveType) {
1842
1843 // workaround for fixing bug... in principle: unsigned long keyMax = (1 << 63) - 1;
1844#if DIM == 1
1845 keyType shiftValue = 1;
1846 keyType toShift = 21;
1847 keyType keyMax = (shiftValue << toShift) - 1; // 1 << 63 not working!
1848#elif DIM == 2
1849 keyType shiftValue = 1;
1850 keyType toShift = 42;
1851 keyType keyMax = (shiftValue << toShift) - 1; // 1 << 63 not working!
1852#else
1853 keyType shiftValue = 1;
1854 keyType toShift = 63;
1855 keyType keyMax = (shiftValue << toShift) - 1; // 1 << 63 not working!
1856 //keyType keyMax = KEY_MAX;
1857#endif
1858
1859 keyType key2test = 0UL;
1860 integer level = 1;
1861
1862 // in principle: traversing a (non-existent) octree by walking the 1D spacefilling curve (keys of the tree nodes)
1863 while (key2test < keyMax) { // TODO: createDomainList(): key2test < or <= keyMax
1864 if (subDomainKeyTree->isDomainListNode(key2test & (~0UL << (DIM * (maxLevel - level + 1))),
1865 maxLevel, level-1, curveType)) {
1866 domainList->domainListKeys[*domainList->domainListIndex] = key2test;
1867 // add domain list level
1868 domainList->domainListLevels[*domainList->domainListIndex] = level;
1869 *domainList->domainListIndex += 1;
1870 if (subDomainKeyTree->isDomainListNode(key2test, maxLevel, level, curveType)) {
1871 level++;
1872 }
1873 else {
1874 key2test = key2test + (1UL << DIM * (maxLevel - level));
1875 while (((key2test >> (DIM * (maxLevel - level))) & (keyType)(POW_DIM - 1)) == 0UL) {
1876 level--;
1877 }
1878 }
1879 } else {
1880 level--;
1881 }
1882
1883 }
1884 }*/
1885
1886 // Parallel version: testing for now ...
1887 /*
1888 __global__ void createDomainList(SubDomainKeyTree *subDomainKeyTree, DomainList *domainList,
1889 integer maxLevel, Curve::Type curveType) {
1890
1891 integer index = threadIdx.x + blockIdx.x * blockDim.x;
1892
1893 //printf("index: %i, threadIdx.x: %i, blockIdx.x: %i, blockDim.x: %i\n", index, threadIdx.x, blockIdx.x, blockDim.x);
1894
1895 keyType key2test = 0UL;
1896 keyType keyMax;
1897 int domainListIndex;
1898
1899 key2test = (keyType)blockIdx.x << (DIM * (maxLevel - 1));
1900
1901 if (threadIdx.x == 0) {
1902
1903 domainListIndex = atomicAdd(domainList->domainListIndex, 1);
1904 domainList->domainListKeys[domainListIndex] = key2test;
1905 domainList->domainListLevels[domainListIndex] = 1;
1906 }
1907
1908 __syncthreads();
1909
1910 integer level = 2;
1911
1912 key2test += threadIdx.x << (DIM * maxLevel - 2);
1913 if (threadIdx.x == (POW_DIM - 1) && blockIdx.x == (POW_DIM - 1)) {
1914#if DIM == 1
1915 keyType shiftValue = 1;
1916 keyType toShift = 21;
1917 keyMax = (shiftValue << toShift) - 1; // 1 << 63 not working!
1918#elif DIM == 2
1919 keyType shiftValue = 1;
1920 keyType toShift = 42;
1921 keyMax = (shiftValue << toShift) - 1; // 1 << 63 not working!
1922#else
1923 keyType shiftValue = 1;
1924 keyType toShift = 63;
1925 keyMax = (shiftValue << toShift) - 1; // 1 << 63 not working!
1926 //keyType keyMax = KEY_MAX;
1927#endif
1928 }
1929 else {
1930 keyMax = key2test + 1 << (DIM * maxLevel - 2);
1931 }
1932
1933 // in principle: traversing a (non-existent) octree by walking the 1D spacefilling curve (keys of the tree nodes)
1934 while (key2test < keyMax) { // TODO: createDomainList(): key2test < or <= keyMax
1935 if (subDomainKeyTree->isDomainListNode(key2test & (~0UL << (DIM * (maxLevel - level + 1))),
1936 maxLevel, level-1, curveType)) {
1937
1938 //if (level > 1) {
1939 domainListIndex = atomicAdd(domainList->domainListIndex, 1);
1940 printf("adding key2test: %lu | level = %i\n", key2test, level);
1941 domainList->domainListKeys[domainListIndex] = key2test;
1942 // add domain list level
1943 domainList->domainListLevels[domainListIndex] = level;
1944 //}
1945 //*domainList->domainListIndex += 1;
1946 if (subDomainKeyTree->isDomainListNode(key2test, maxLevel, level, curveType)) {
1947 level++;
1948 }
1949 else {
1950 key2test = key2test + (1UL << DIM * (maxLevel - level));
1951 while (((key2test >> (DIM * (maxLevel - level))) & (keyType)(POW_DIM - 1)) == 0UL) {
1952 level--;
1953 }
1954 }
1955 } else {
1956 level--;
1957 }
1958
1959 }
1960 }*/
1961
1962 __global__ void createDomainList(SubDomainKeyTree *subDomainKeyTree, DomainList *domainList,
1963 integer maxLevel, Curve::Type curveType) {
1964
1965 integer index = threadIdx.x + blockIdx.x * blockDim.x;
1966
1967 //printf("index: %i, threadIdx.x: %i, blockIdx.x: %i, blockDim.x: %i\n", index, threadIdx.x, blockIdx.x, blockDim.x);
1968
1969 keyType key2test = 0UL;
1970 keyType keyMax;
1971 int domainListIndex;
1972
1973 key2test = (keyType)blockIdx.x << (DIM * (maxLevel - 1));
1974
1975 if (threadIdx.x == 0) {
1976
1977 domainListIndex = atomicAdd(domainList->domainListIndex, 1);
1978 domainList->domainListKeys[domainListIndex] = key2test;
1979 domainList->domainListLevels[domainListIndex] = 1;
1980 }
1981
1982 __syncthreads();
1983
1984 integer level = 2;
1985
1986 key2test += (keyType)threadIdx.x << (DIM * (maxLevel - 2));
1987 if (threadIdx.x == (POW_DIM - 1) && blockIdx.x == (POW_DIM - 1)) {
1988#if DIM == 1
1989 keyType shiftValue = 1;
1990 keyType toShift = 21;
1991 keyMax = (shiftValue << toShift) - 1; // 1 << 63 not working!
1992#elif DIM == 2
1993 keyType shiftValue = 1;
1994 keyType toShift = 42;
1995 keyMax = (shiftValue << toShift) - 1; // 1 << 63 not working!
1996#else
1997 keyType shiftValue = 1;
1998 keyType toShift = 63;
1999 keyMax = (shiftValue << toShift) - 1; // 1 << 63 not working!
2000 //keyType keyMax = KEY_MAX;
2001#endif
2002 }
2003 else {
2004 keyMax = key2test + (1UL << (DIM * (maxLevel - 2)));
2005 }
2006
2007 //printf("threadIdx.x = %i, blockIdx.x = %i, key2test = %lu, keyMax = %lu\n", threadIdx.x, blockIdx.x, key2test, keyMax);
2008
2009 // in principle: traversing a (non-existent) octree by walking the 1D spacefilling curve (keys of the tree nodes)
2010 while (key2test < keyMax && level > 1) { // TODO: createDomainList(): key2test < or <= keyMax
2011 if (subDomainKeyTree->isDomainListNode(key2test & (~0UL << (DIM * (maxLevel - level + 1))),
2012 maxLevel, level-1, curveType)) {
2013
2014 //if (level > 1) {
2015 domainListIndex = atomicAdd(domainList->domainListIndex, 1);
2016 //printf("adding key2test: %lu | level = %i\n", key2test, level);
2017 domainList->domainListKeys[domainListIndex] = key2test;
2018 // add domain list level
2019 domainList->domainListLevels[domainListIndex] = level;
2020 //}
2021 //*domainList->domainListIndex += 1;
2022 if (subDomainKeyTree->isDomainListNode(key2test, maxLevel, level, curveType)) {
2023 level++;
2024 }
2025 else {
2026 key2test = key2test + (1UL << DIM * (maxLevel - level));
2027 while (((key2test >> (DIM * (maxLevel - level))) & (keyType)(POW_DIM - 1)) == 0UL) {
2028 level--;
2029 }
2030 }
2031 } else {
2032 level--;
2033 }
2034
2035 }
2036 }
2037
2038
2039 __global__ void lowestDomainList(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles,
2040 DomainList *domainList, DomainList *lowestDomainList,
2041 integer n, integer m) {
2042
2043 integer index = threadIdx.x + blockIdx.x * blockDim.x;
2044 integer stride = blockDim.x * gridDim.x;
2045 integer offset = 0;
2046
2047 bool lowestDomainListNode;
2048 integer domainIndex;
2049 integer lowestDomainIndex;
2050 integer childIndex;
2051
2052 // check all domain list nodes
2053 while ((index + offset) < *domainList->domainListIndex) {
2054 lowestDomainListNode = true;
2055 // get domain list index of current domain list node
2056 domainIndex = domainList->domainListIndices[index + offset];
2057 // check all children
2058 for (int i=0; i<POW_DIM; i++) {
2059 childIndex = tree->child[POW_DIM * domainIndex + i];
2060 // check whether child exists
2061 if (childIndex != -1) {
2062 // check whether child is a node
2063 if (childIndex >= n) {
2064 // check if this node is a domain list node
2065 if (particles->nodeType[childIndex] >= 1) {
2066 lowestDomainListNode = false;
2067 }
2068 //for (int k=0; k<*domainList->domainListIndex; k++) {
2069 // if (childIndex == domainList->domainListIndices[k]) {
2070 // //printf("domainIndex = %i childIndex: %i domainListIndices: %i\n", domainIndex,
2071 // // childIndex, domainListIndices[k]);
2072 // lowestDomainListNode = false;
2073 // break;
2074 // }
2075 //}
2076 // one child being a domain list node is sufficient for not being a lowest domain list node
2077 if (!lowestDomainListNode) {
2078 break;
2079 }
2080 }
2081 }
2082 }
2083
2084 if (lowestDomainListNode) {
2085 // increment lowest domain list counter/index
2086 lowestDomainIndex = atomicAdd(lowestDomainList->domainListIndex, 1);
2087 // add/save index of lowest domain list node
2088 lowestDomainList->domainListIndices[lowestDomainIndex] = domainIndex;
2089 particles->nodeType[domainIndex] = 2;
2090 // add/save key of lowest domain list node
2091 lowestDomainList->domainListKeys[lowestDomainIndex] = domainList->domainListKeys[index + offset];
2092 // add/save level of lowest domain list node
2093 lowestDomainList->domainListLevels[lowestDomainIndex] = domainList->domainListLevels[index + offset];
2094
2095 lowestDomainList->borders[lowestDomainIndex * 2 * DIM] = domainList->borders[(index + offset) * 2 * DIM];
2096 lowestDomainList->borders[lowestDomainIndex * 2 * DIM + 1] = domainList->borders[(index + offset) * 2 * DIM + 1];
2097#if DIM > 1
2098 lowestDomainList->borders[lowestDomainIndex * 2 * DIM + 2] = domainList->borders[(index + offset) * 2 * DIM + 2];
2099 lowestDomainList->borders[lowestDomainIndex * 2 * DIM + 3] = domainList->borders[(index + offset) * 2 * DIM + 3];
2100#if DIM == 3
2101 lowestDomainList->borders[lowestDomainIndex * 2 * DIM + 4] = domainList->borders[(index + offset) * 2 * DIM + 4];
2102 lowestDomainList->borders[lowestDomainIndex * 2 * DIM + 5] = domainList->borders[(index + offset) * 2 * DIM + 5];
2103#endif
2104#endif
2105
2106 // debugging
2107 //printf("[rank %i] Adding lowest domain list node #%i : %i (key = %lu)\n", subDomainKeyTree->rank, lowestDomainIndex, domainIndex,
2108 // lowestDomainList->domainListKeys[lowestDomainIndex]);
2109 }
2110 offset += stride;
2111 }
2112
2113 }
2114
2115 void Launch::set(DomainList *domainList, integer *domainListIndices, integer *domainListLevels,
2116 integer *domainListIndex, integer *domainListCounter, keyType *domainListKeys,
2117 keyType *sortedDomainListKeys, integer *relevantDomainListIndices,
2118 integer *relevantDomainListLevels, integer *relevantDomainListProcess) {
2119 ExecutionPolicy executionPolicy(1, 1);
2120 cuda::launch(false, executionPolicy, ::DomainListNS::Kernel::set, domainList, domainListIndices, domainListLevels,
2121 domainListIndex, domainListCounter, domainListKeys, sortedDomainListKeys,
2122 relevantDomainListIndices, relevantDomainListLevels, relevantDomainListProcess);
2123 }
2124
2125 void Launch::setBorders(DomainList *domainList, real *borders, integer *relevantDomainListOriginalIndex) {
2126 ExecutionPolicy executionPolicy(1, 1);
2127 cuda::launch(false, executionPolicy, ::DomainListNS::Kernel::setBorders, domainList, borders,
2128 relevantDomainListOriginalIndex);
2129 }
2130
2131 real Launch::info(Particles *particles, DomainList *domainList) {
2132 ExecutionPolicy executionPolicy;
2133 return cuda::launch(true, executionPolicy, ::DomainListNS::Kernel::info, particles, domainList);
2134 }
2135
2136 real Launch::info(Particles *particles, DomainList *domainList, DomainList *lowestDomainList) {
2137 ExecutionPolicy executionPolicy;
2138 return cuda::launch(true, executionPolicy, ::DomainListNS::Kernel::info, particles, domainList, lowestDomainList);
2139 }
2140
2141 real Launch::createDomainList(SubDomainKeyTree *subDomainKeyTree, DomainList *domainList, integer maxLevel,
2142 Curve::Type curveType) {
2143 //TODO: is there any possibility to call kernel createDomainList() with more than one thread?
2144 ExecutionPolicy executionPolicy(POW_DIM, POW_DIM);
2145 return cuda::launch(true, executionPolicy, ::DomainListNS::Kernel::createDomainList, subDomainKeyTree,
2146 domainList, maxLevel, curveType);
2147 }
2148
2149 real Launch::lowestDomainList(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles,
2150 DomainList *domainList, DomainList *lowestDomainList, integer n, integer m) {
2151 ExecutionPolicy executionPolicy;
2152 return cuda::launch(true, executionPolicy, ::DomainListNS::Kernel::lowestDomainList, subDomainKeyTree,
2153 tree, particles, domainList, lowestDomainList, n, m);
2154 }
2155
2156 }
2157
2158}
2159
2160namespace ParticlesNS {
2161 __device__ bool applySphericalCriterion(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles,
2162 real d, int index) {
2163
2164#if DIM == 1
2165 if (cuda::math::sqrt(particles->x[index] * particles->x[index]) < d) {
2166#elif DIM == 2
2167 if (cuda::math::sqrt(particles->x[index] * particles->x[index] +
2168 particles->y[index] * particles->y[index]) < d) {
2169#else
2170 if (cuda::math::sqrt(particles->x[index] * particles->x[index] + particles->y[index] * particles->y[index] +
2171 particles->z[index] * particles->z[index]) < d) {
2172#endif
2173 return false;
2174 } else {
2175 return true;
2176 }
2177 }
2178
2179 __device__ bool applyCubicCriterion(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles,
2180 real d, int index) {
2181
2182#if DIM == 1
2183 if (cuda::math::abs(particles->x[index]) < d) {
2184#elif DIM == 2
2185 if (cuda::math::abs(particles->x[index]) < d &&
2186 cuda::math::abs(particles->y[index]) < d) {
2187#else
2188 if (cuda::math::abs(particles->x[index]) < d &&
2189 cuda::math::abs(particles->y[index]) < d &&
2190 cuda::math::abs(particles->z[index]) < d) {
2191#endif
2192 return false;
2193 } else {
2194 return true;
2195 }
2196 }
2197
2198 namespace Kernel {
2199
2200 __global__ void mark2remove(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles,
2201 int *particles2remove, int *counter, int criterion, real d, int numParticles) {
2202
2203 int bodyIndex = threadIdx.x + blockDim.x * blockIdx.x;
2204 int stride = blockDim.x * gridDim.x;
2205 int offset = 0;
2206 bool remove;
2207
2208 while (bodyIndex + offset < numParticles) {
2209
2210 switch (criterion) {
2211 case 0: {
2212 remove = applySphericalCriterion(subDomainKeyTree, tree, particles, d, bodyIndex + offset);
2213 } break;
2214 case 1: {
2215 remove = applyCubicCriterion(subDomainKeyTree, tree, particles, d, bodyIndex + offset);
2216 } break;
2217 default: {
2218 cudaTerminate("Criterion for removing particles not available! Exiting...\n");
2219 }
2220 }
2221 if (remove) {
2222 particles2remove[bodyIndex + offset] = 1;
2223 atomicAdd(counter, 1);
2224 } else {
2225 particles2remove[bodyIndex + offset] = 0;
2226 }
2227
2228 offset += stride;
2229 }
2230
2231 }
2232
2233 real Launch::mark2remove(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles,
2234 int *particles2remove, int *counter, int criterion, real d,
2235 int numParticles) {
2236 ExecutionPolicy executionPolicy;
2237 return cuda::launch(true, executionPolicy, ::ParticlesNS::Kernel::mark2remove, subDomainKeyTree,
2238 tree, particles, particles2remove, counter, criterion, d, numParticles);
2239 }
2240
2241 }
2242}
2243
2244namespace CudaUtils {
2245
2246 namespace Kernel {
2247
2248 template<typename T, typename U>
2249 __global__ void markDuplicatesTemp(Tree *tree, DomainList *domainList, T *array, U *entry1, U *entry2, U *entry3, integer *duplicateCounter, integer *child, int length) {
2250
2251 integer index = threadIdx.x + blockIdx.x * blockDim.x;
2252 integer stride = blockDim.x * gridDim.x;
2253 integer offset = 0;
2254 integer maxIndex;
2255
2256 bool isChild;
2257 //remark: check only x, but in principle check all
2258 while ((index + offset) < length) {
2259 if (array[index + offset] != -1) {
2260 for (integer i = 0; i < length; i++) {
2261 if (i != (index + offset)) {
2262 if (array[i] != -1 && (array[index + offset] == array[i] || (entry1[array[i]] == entry1[array[index + offset]] &&
2263 entry2[array[i]] == entry2[array[index + offset]] &&
2264 entry3[array[i]] == entry3[array[index + offset]]))) {
2265 isChild = false;
2266
2267 if (true/*array[index + offset] == array[i]*/) {
2268 printf("DUPLICATE: %i vs %i | (%f, %f, %f) vs (%f, %f, %f):\n",
2269 array[index + offset], array[i],
2270 entry1[array[index + offset]], entry2[array[index + offset]], entry3[array[index + offset]],
2271 entry1[array[i]], entry2[array[i]], entry3[array[i]]);
2272
2273 for (int k=0; k<*domainList->domainListIndex; k++) {
2274 if (array[index + offset] == domainList->domainListIndices[k] ||
2275 array[i] == domainList->domainListIndices[k]) {
2276 printf("DUPLICATE is domainList!\n");
2277 }
2278 }
2279
2280 for (int k=0; k<POW_DIM; k++) {
2281 if (child[POW_DIM*array[index + offset] + k] == array[i]) {
2282 printf("isChild: index = %i: child %i == i = %i\n", array[index + offset],
2283 k, array[i]);
2284 isChild = true;
2285 }
2286
2287 if (child[8*array[i] + k] == array[index + offset]) {
2288 printf("isChild: index = %i: child %i == index = %i\n", array[i],
2289 k, array[index + offset]);
2290 isChild = true;
2291 }
2292 }
2293
2294 if (!isChild) {
2295 printf("isChild: Duplicate NOT a child: %i vs %i | (%f, %f, %f) vs (%f, %f, %f):\n",
2296 array[index + offset], array[i],
2297 entry1[array[index + offset]], entry2[array[index + offset]], entry3[array[index + offset]],
2298 entry1[array[i]], entry2[array[i]], entry3[array[i]]);
2299 //for (int k=0; k<POW_DIM; k++) {
2300 // printf("isChild: Duplicate NOT a child: children index = %i: child %i == i = %i\n", array[index + offset],
2301 // k, array[i]);
2302 //
2303 // printf("isChild: Duplicate NOT a child: children index = %i: child %i == index = %i\n", array[i],
2304 // k, array[index + offset]);
2305 //
2306 //}
2307 }
2308
2309 }
2310
2311 //maxIndex = max(index + offset, i);
2312 // mark larger index with -1 (thus a duplicate)
2313 //array[maxIndex] = -1;
2314 //atomicAdd(duplicateCounter, 1);
2315 }
2316 }
2317
2318 }
2319 }
2320 offset += stride;
2321 }
2322 }
2323
2324 template <typename T, unsigned int blockSize>
2325 __global__ void reduceBlockwise(T *array, T *outputData, int n) {
2326
2327 extern __shared__ T sdata[];
2328
2329 unsigned int tid = threadIdx.x;
2330 unsigned int i = blockIdx.x*(blockSize*2) + tid;
2331 unsigned int gridSize = blockSize*2*gridDim.x;
2332
2333 sdata[tid] = 0;
2334
2335 while (i < n)
2336 {
2337 sdata[tid] += array[i] + array[i+blockSize];
2338 i += gridSize;
2339 }
2340
2341 __syncthreads();
2342
2343 if (blockSize >= 512) {
2344 if (tid < 256) {
2345 array[tid] += array[tid + 256];
2346 }
2347 __syncthreads();
2348 }
2349 if (blockSize >= 256) {
2350 if (tid < 128) {
2351 array[tid] += array[tid + 128];
2352 }
2353 __syncthreads();
2354 }
2355 if (blockSize >= 128) {
2356 if (tid < 64) {
2357 array[tid] += array[tid + 64];
2358 }
2359 __syncthreads();
2360 }
2361
2362 if (tid < 32)
2363 {
2364 if (blockSize >= 64) array[tid] += array[tid + 32];
2365 if (blockSize >= 32) array[tid] += array[tid + 16];
2366 if (blockSize >= 16) array[tid] += array[tid + 8];
2367 if (blockSize >= 8) array[tid] += array[tid + 4];
2368 if (blockSize >= 4) array[tid] += array[tid + 2];
2369 if (blockSize >= 2) array[tid] += array[tid + 1];
2370 }
2371
2372 if (tid == 0) {
2373 outputData[blockIdx.x] = array[0];
2374 }
2375 }
2376
2377 //template __global__ void reduceBlockwise<real, 256>(real *array, real *outputData, int n);
2378
2379 template <typename T, unsigned int blockSize>
2380 __global__ void blockReduction(const T *indata, T *outdata) {
2381 integer index = threadIdx.x + blockIdx.x * blockDim.x;
2382 integer stride = blockDim.x * gridDim.x;
2383 integer offset = 0;
2384
2385 //extern __shared__ real *buff;
2386
2387 while ((index + offset) < blockSize) {
2388 atomicAdd(&outdata[0], indata[index + offset]);
2389 __threadfence();
2390 offset += stride;
2391 }
2392
2393 }
2394
2395 namespace Launch {
2396 template<typename T, typename U>
2397 real markDuplicatesTemp(Tree *tree, DomainList *domainList, T *array, U *entry1, U *entry2, U *entry3, integer *duplicateCounter, integer *child, int length) {
2398 ExecutionPolicy executionPolicy;
2399 return cuda::launch(true, executionPolicy, ::CudaUtils::Kernel::markDuplicatesTemp<T, U>, tree, domainList, array, entry1, entry2,
2400 entry3, duplicateCounter, child, length);
2401 }
2402 template real markDuplicatesTemp<integer, real>(Tree *tree, DomainList *domainList, integer *array, real *entry1, real *entry2, real *entry3, integer *duplicateCounter, integer *child, int length);
2403
2404 template <typename T, unsigned int blockSize>
2405 real reduceBlockwise(T *array, T *outputData, int n) {
2406 ExecutionPolicy executionPolicy(256, blockSize, blockSize * sizeof(T));
2407 return cuda::launch(true, executionPolicy, ::CudaUtils::Kernel::reduceBlockwise<T, blockSize>, array, outputData, n);
2408 }
2409 template real reduceBlockwise<real, 256>(real *array, real *outputData, int n);
2410
2411 template <typename T, unsigned int blockSize>
2412 real blockReduction(const T *indata, T *outdata) {
2413 ExecutionPolicy executionPolicy(256, blockSize);
2414 return cuda::launch(true, executionPolicy, ::CudaUtils::Kernel::blockReduction<T, blockSize>,
2415 indata, outdata);
2416 }
2417 template real blockReduction<real, 256>(const real *indata, real *outdata);
2418
2419 }
2420 }
2421}
2422
2423namespace Physics {
2424 namespace Kernel {
2425
2426 // see: https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf
2427 template <unsigned int blockSize>
2428 __global__ void calculateAngularMomentumBlockwise(Particles *particles, real *outputData, int n) {
2429
2430 extern __shared__ real sdata[];
2431 real *lx = sdata;
2432#if DIM > 1
2433 real *ly = &sdata[blockSize];
2434#if DIM > 2
2435 real *lz = &sdata[2 * blockSize];
2436#endif
2437#endif
2438 unsigned int tid = threadIdx.x;
2439 unsigned int i = blockIdx.x*(blockSize*2) + tid;
2440 unsigned int gridSize = blockSize*2*gridDim.x;
2441
2442 lx[tid] = 0.;
2443#if DIM > 1
2444 ly[tid] = 0.;
2445#if DIM > 2
2446 lz[tid] = 0.;
2447#endif
2448#endif
2449
2450 while (i < n)
2451 {
2452// TODO: implementation for DIM == 2
2453#if DIM == 3
2454 lx[tid] += particles->mass[i] * (particles->y[i]*particles->vz[i] - particles->z[i]*particles->vy[i]) +
2455 particles->mass[i+blockSize] * (particles->y[i+blockSize]*particles->vz[i+blockSize] - particles->z[i+blockSize]*particles->vy[i+blockSize]);
2456
2457 ly[tid] += particles->mass[i] * (particles->z[i]*particles->vx[i] - particles->x[i]*particles->vz[i]) +
2458 particles->mass[i+blockSize] * (particles->z[i+blockSize]*particles->vx[i+blockSize] - particles->x[i+blockSize]*particles->vz[i+blockSize]);
2459
2460 lz[tid] += particles->mass[i] * (particles->x[i]*particles->vy[i] - particles->y[i]*particles->vx[i]) +
2461 particles->mass[i+blockSize] * (particles->x[i+blockSize]*particles->vy[i+blockSize] - particles->y[i+blockSize]*particles->vx[i+blockSize]);
2462#endif
2463
2464 i += gridSize;
2465 }
2466
2467 __syncthreads();
2468
2469 if (blockSize >= 512) {
2470 if (tid < 256) {
2471 lx[tid] += lx[tid + 256];
2472#if DIM > 1
2473 ly[tid] += ly[tid + 256];
2474#if DIM > 2
2475 lz[tid] += lz[tid + 256];
2476#endif
2477#endif
2478 }
2479 __syncthreads();
2480 }
2481 if (blockSize >= 256) {
2482 if (tid < 128) {
2483 lx[tid] += lx[tid + 128];
2484#if DIM > 1
2485 ly[tid] += ly[tid + 128];
2486#if DIM > 2
2487 lz[tid] += lz[tid + 128];
2488#endif
2489#endif
2490 }
2491 __syncthreads();
2492 }
2493 if (blockSize >= 128) {
2494 if (tid < 64) {
2495 lx[tid] += lx[tid + 64];
2496#if DIM > 1
2497 ly[tid] += ly[tid + 64];
2498#if DIM > 2
2499 lz[tid] += lz[tid + 64];
2500#endif
2501#endif
2502 }
2503 __syncthreads();
2504 }
2505
2506 if (tid < 32)
2507 {
2508 if (blockSize >= 64) {
2509 lx[tid] += lx[tid + 32];
2510#if DIM > 1
2511 ly[tid] += ly[tid + 32];
2512#if DIM > 2
2513 lz[tid] += lz[tid + 32];
2514#endif
2515#endif
2516 }
2517 if (blockSize >= 32) {
2518 lx[tid] += lx[tid + 16];
2519#if DIM > 1
2520 ly[tid] += ly[tid + 16];
2521#if DIM > 2
2522 lz[tid] += lz[tid + 16];
2523#endif
2524#endif
2525 }
2526 if (blockSize >= 16) {
2527 lx[tid] += lx[tid + 8];
2528#if DIM > 1
2529 ly[tid] += ly[tid + 8];
2530#if DIM > 2
2531 lz[tid] += lz[tid + 8];
2532#endif
2533#endif
2534 }
2535 if (blockSize >= 8) {
2536 lx[tid] += lx[tid + 4];
2537#if DIM > 1
2538 ly[tid] += ly[tid + 4];
2539#if DIM > 2
2540 lz[tid] += lz[tid + 4];
2541#endif
2542#endif
2543 }
2544 if (blockSize >= 4) {
2545 lx[tid] += lx[tid + 2];
2546#if DIM > 1
2547 ly[tid] += ly[tid + 2];
2548#if DIM > 2
2549 lz[tid] += lz[tid + 2];
2550#endif
2551#endif
2552 }
2553 if (blockSize >= 2) {
2554 lx[tid] += lx[tid + 1];
2555#if DIM > 1
2556 ly[tid] += ly[tid + 1];
2557#if DIM > 2
2558 lz[tid] += lz[tid + 1];
2559#endif
2560#endif
2561 }
2562 }
2563
2564 if (tid == 0) {
2565 outputData[blockIdx.x] = lx[0];
2566#if DIM > 1
2567 outputData[blockSize + blockIdx.x] = ly[0];
2568#if DIM > 2
2569 outputData[2 * blockSize + blockIdx.x] = lz[0];
2570#endif
2571#endif
2572 }
2573 }
2574
2575 template <unsigned int blockSize>
2576 __global__ void sumAngularMomentum(const real *indata, real *outdata) {
2577
2578 integer index = threadIdx.x + blockIdx.x * blockDim.x;
2579 integer stride = blockDim.x * gridDim.x;
2580 integer offset = 0;
2581
2582 //extern __shared__ real *buff;
2583
2584 while ((index + offset) < blockSize) {
2585 atomicAdd(&outdata[0], indata[index + offset]);
2586#if DIM > 1
2587 atomicAdd(&outdata[1], indata[blockSize + index + offset]);
2588#if DIM > 2
2589 atomicAdd(&outdata[2], indata[2 * blockSize + index + offset]);
2590#endif
2591#endif
2592 __threadfence();
2593 offset += stride;
2594 }
2595
2596 }
2597
2598 __global__ void kineticEnergy(Particles *particles, int n) {
2599 integer index = threadIdx.x + blockIdx.x * blockDim.x;
2600 integer stride = blockDim.x * gridDim.x;
2601 integer offset = 0;
2602 real vel;
2603
2604 while ((index + offset) < n) {
2605#if DIM == 1
2606 vel = abs(particles->vx[index + offset]);
2607#elif DIM == 2
2608 vel = cuda::math::sqrt(particles->vx[index + offset] * particles->vx[index + offset] +
2609 particles->vy[index + offset] * particles->vy[index + offset]);
2610#else
2611 vel = cuda::math::sqrt(particles->vx[index + offset] * particles->vx[index + offset] +
2612 particles->vy[index + offset] * particles->vy[index + offset] +
2613 particles->vz[index + offset] * particles->vz[index + offset]);
2614#endif
2615
2616 particles->u[index + offset] += 0.5 * particles->mass[index + offset] * vel * vel;
2617 offset += stride;
2618 }
2619 }
2620
2621 namespace Launch {
2622 template <unsigned int blockSize>
2623 real calculateAngularMomentumBlockwise(Particles *particles, real *outputData, int n) {
2624 ExecutionPolicy executionPolicy(256, blockSize, DIM * blockSize * sizeof(real));
2625 return cuda::launch(true, executionPolicy, ::Physics::Kernel::calculateAngularMomentumBlockwise<blockSize>,
2626 particles, outputData, n);
2627 }
2628 template real calculateAngularMomentumBlockwise<256>(Particles *particles, real *outputData, int n);
2629
2630 template <unsigned int blockSize>
2631 real sumAngularMomentum(const real *indata, real *outdata) {
2632 ExecutionPolicy executionPolicy(256, blockSize); //, DIM * sizeof(real));
2633 return cuda::launch(true, executionPolicy, ::Physics::Kernel::sumAngularMomentum<blockSize>,
2634 indata, outdata);
2635 }
2636 template real sumAngularMomentum<256>(const real *indata, real *outdata);
2637
2638 real kineticEnergy(Particles *particles, int n) {
2639 ExecutionPolicy executionPolicy;
2640 return cuda::launch(true, executionPolicy, ::Physics::Kernel::kineticEnergy, particles, n);
2641 }
2642 }
2643 }
2644}
DomainList
Definition: subdomain.cuh:494
DomainList::domainListKeys
keyType * domainListKeys
domain list node keys
Definition: subdomain.cuh:507
DomainList::domainListIndex
integer * domainListIndex
domain list node index, thus amount of domain list nodes
Definition: subdomain.cuh:503
DomainList::setBorders
CUDA_CALLABLE_MEMBER void setBorders(real *borders, integer *relevantDomainListOriginalIndex)
Definition: subdomain.cu:1714
DomainList::sortedDomainListKeys
keyType * sortedDomainListKeys
sorted domain list node keys, usable as output for sorting the keys
Definition: subdomain.cuh:509
DomainList::DomainList
CUDA_CALLABLE_MEMBER DomainList()
Constructor.
Definition: subdomain.cu:1674
DomainList::relevantDomainListOriginalIndex
integer * relevantDomainListOriginalIndex
Definition: subdomain.cuh:517
DomainList::domainListIndices
integer * domainListIndices
domain list node indices
Definition: subdomain.cuh:499
DomainList::domainListLevels
integer * domainListLevels
domain list node levels
Definition: subdomain.cuh:501
DomainList::relevantDomainListIndices
integer * relevantDomainListIndices
concentrate domain list nodes, usable to reduce domain list indices in respect to some criterion
Definition: subdomain.cuh:511
DomainList::domainListCounter
integer * domainListCounter
domain list node counter, usable as buffer
Definition: subdomain.cuh:505
DomainList::relevantDomainListProcess
integer * relevantDomainListProcess
Definition: subdomain.cuh:515
DomainList::set
CUDA_CALLABLE_MEMBER void set(integer *domainListIndices, integer *domainListLevels, integer *domainListIndex, integer *domainListCounter, keyType *domainListKeys, keyType *sortedDomainListKeys, integer *relevantDomainListIndices, integer *relevantDomainListLevels, integer *relevantDomainListProcess)
Setter, passing pointer to member variables.
Definition: subdomain.cu:1696
DomainList::borders
real * borders
Definition: subdomain.cuh:519
DomainList::~DomainList
CUDA_CALLABLE_MEMBER ~DomainList()
Destructor.
Definition: subdomain.cu:1692
DomainList::relevantDomainListLevels
integer * relevantDomainListLevels
Definition: subdomain.cuh:513
ExecutionPolicy
Execution policy/instruction for CUDA kernel execution.
Definition: cuda_launcher.cuh:33
Helper
Definition: helper.cuh:24
Helper::integerBuffer
integer * integerBuffer
Definition: helper.cuh:55
Helper::integerVal
integer * integerVal
Definition: helper.cuh:45
Helper::keyTypeBuffer
keyType * keyTypeBuffer
Definition: helper.cuh:70
Particles
Particle(s) class based on SoA (Structur of Arrays).
Definition: particles.cuh:50
Particles::x
real * x
(pointer to) x position (array)
Definition: particles.cuh:62
Particles::level
integer * level
(pointer to) level of the (pseudo-)particles
Definition: particles.cuh:106
Particles::g_ay
real * g_ay
(pointer to) y gravitational acceleration (array)
Definition: particles.cuh:92
Particles::ay
real * ay
(pointer to) y acceleration (array)
Definition: particles.cuh:74
Particles::nodeType
integer * nodeType
(pointer to) node type
Definition: particles.cuh:103
Particles::az
real * az
(pointer to) z acceleration (array)
Definition: particles.cuh:82
Particles::y
real * y
(pointer to) y position (array)
Definition: particles.cuh:70
Particles::ax
real * ax
(pointer to) x acceleration (array)
Definition: particles.cuh:66
Particles::u
real * u
energy (kinetic + gravitational for now)
Definition: particles.cuh:126
Particles::g_ax
real * g_ax
(pointer to) x gravitational acceleration (array)
Definition: particles.cuh:88
Particles::mass
real * mass
(pointer to) mass (array)
Definition: particles.cuh:60
Particles::z
real * z
(pointer to) z position (array)
Definition: particles.cuh:78
Particles::vz
real * vz
(pointer to) z velocity (array)
Definition: particles.cuh:80
Particles::g_az
real * g_az
(pointer to) z gravitational acceleration (array)
Definition: particles.cuh:96
Particles::vx
real * vx
(pointer to) x velocity (array)
Definition: particles.cuh:64
Particles::vy
real * vy
(pointer to) y velocity (array)
Definition: particles.cuh:72
SubDomainKeyTree
SubDomainKeyTree class handling rank, number of processes and ranges.
Definition: subdomain.cuh:62
SubDomainKeyTree::range
keyType * range
Space-filling curve ranges, mapping key ranges/borders to MPI processes.
Definition: subdomain.cuh:70
SubDomainKeyTree::~SubDomainKeyTree
CUDA_CALLABLE_MEMBER ~SubDomainKeyTree()
Destructor.
Definition: subdomain.cu:32
SubDomainKeyTree::key2proc
CUDA_CALLABLE_MEMBER integer key2proc(keyType key)
Compute particle's MPI process belonging by it's key.
Definition: subdomain.cu:68
SubDomainKeyTree::isDomainListNode
CUDA_CALLABLE_MEMBER bool isDomainListNode(keyType key, integer maxLevel, integer level, Curve::Type curveType=Curve::lebesgue)
Check whether key, thus particle, represents a domain list node.
Definition: subdomain.cu:79
SubDomainKeyTree::set
CUDA_CALLABLE_MEMBER void set(integer rank, integer numProcesses, keyType *range, integer *procParticleCounter)
Setter.
Definition: subdomain.cu:36
SubDomainKeyTree::procParticleCounter
integer * procParticleCounter
particle counter in dependence of MPI process(es)
Definition: subdomain.cuh:73
SubDomainKeyTree::rank
integer rank
MPI rank.
Definition: subdomain.cuh:66
SubDomainKeyTree::SubDomainKeyTree
CUDA_CALLABLE_MEMBER SubDomainKeyTree()
Default Constructor.
Definition: subdomain.cu:21
SubDomainKeyTree::numProcesses
integer numProcesses
MPI number of processes.
Definition: subdomain.cuh:68
Tree
Tree class.
Definition: tree.cuh:140
cudaTerminate
#define cudaTerminate(...)
Definition: cuda_utilities.cuh:70
CUDA_CALLABLE_MEMBER
#define CUDA_CALLABLE_MEMBER
Definition: cuda_utilities.cuh:30
CudaUtils::Kernel::Launch::reduceBlockwise< real, 256 >
template real reduceBlockwise< real, 256 >(real *array, real *outputData, int n)
CudaUtils::Kernel::Launch::markDuplicatesTemp< integer, real >
template real markDuplicatesTemp< integer, real >(Tree *tree, DomainList *domainList, integer *array, real *entry1, real *entry2, real *entry3, integer *duplicateCounter, integer *child, int length)
CudaUtils::Kernel::Launch::blockReduction< real, 256 >
template real blockReduction< real, 256 >(const real *indata, real *outdata)
CudaUtils::Kernel::markDuplicatesTemp
__global__ void markDuplicatesTemp(Tree *tree, DomainList *domainList, T *array, U *entry1, U *entry2, U *entry3, integer *duplicateCounter, integer *child, int length)
Definition: subdomain.cu:2249
CudaUtils::Kernel::reduceBlockwise
__global__ void reduceBlockwise(T *array, T *outputData, int n)
Definition: subdomain.cu:2325
CudaUtils::Kernel::blockReduction
__global__ void blockReduction(const T *indata, T *outdata)
Definition: subdomain.cu:2380
CudaUtils
Definition: cuda_utilities.cuh:100
DomainListNS::Kernel::lowestDomainList
__global__ void lowestDomainList(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, DomainList *lowestDomainList, integer n, integer m)
Kernel to create the lowest domain list.
Definition: subdomain.cu:2039
DomainListNS::Kernel::setBorders
__global__ void setBorders(DomainList *domainList, real *borders, integer *relevantDomainListOriginalIndex)
Definition: subdomain.cu:1733
DomainListNS::Kernel::createDomainList
__global__ void createDomainList(SubDomainKeyTree *subDomainKeyTree, DomainList *domainList, integer maxLevel, Curve::Type curveType=Curve::lebesgue)
Kernel to create the domain list.
Definition: subdomain.cu:1962
DomainListNS
Definition: subdomain.cuh:565
HelperNS::sortArray
real sortArray(A *arrayToSort, A *sortedArray, B *keyIn, B *keyOut, integer n)
Definition: helper.cu:151
Kernel
Definition: device_rhs.cuh:7
KeyNS::key2proc
CUDA_CALLABLE_MEMBER integer key2proc(keyType key, SubDomainKeyTree *subDomainKeyTree)
Convert the key to the corresponding process.
Definition: subdomain.cu:17
KeyNS::lebesgue2hilbert
CUDA_CALLABLE_MEMBER keyType lebesgue2hilbert(keyType lebesgue, integer maxLevel)
Convert a Lebesgue key to a Hilbert key.
Definition: tree.cu:4
KeyNS::key2Char
CUDA_CALLABLE_MEMBER void key2Char(keyType key, integer maxLevel, char *keyAsChar)
Convert a key to a char for printing.
Definition: subdomain.cu:5
MaterialNS::Kernel::info
__global__ void info(Material *material)
Debug kernel giving information about material(s).
Definition: material.cu:24
ParticlesNS::Kernel::test
__global__ void test(Particles *particles)
Definition: particles.cu:902
ParticlesNS::Kernel::mark2remove
__global__ void mark2remove(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, int *particles2remove, int *counter, int criterion, real d, int numParticles)
Definition: subdomain.cu:2200
ParticlesNS
Particle class related functions and kernels.
Definition: particles.cuh:552
ParticlesNS::applySphericalCriterion
__device__ bool applySphericalCriterion(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, real d, int index)
Check whether particle(s) are within sphere from simulation center.
Definition: subdomain.cu:2161
ParticlesNS::applyCubicCriterion
__device__ bool applyCubicCriterion(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, real d, int index)
Check whether particle(s) are within cube from simulation center.
Definition: subdomain.cu:2179
Physics::Kernel::Launch::sumAngularMomentum< 256 >
template real sumAngularMomentum< 256 >(const real *indata, real *outdata)
Physics::Kernel::Launch::calculateAngularMomentumBlockwise< 256 >
template real calculateAngularMomentumBlockwise< 256 >(Particles *particles, real *outputData, int n)
Physics::Kernel::calculateAngularMomentumBlockwise
__global__ void calculateAngularMomentumBlockwise(Particles *particles, real *outputData, int n)
Calculate angular momentum for all particles (per block).
Definition: subdomain.cu:2428
Physics::Kernel::kineticEnergy
__global__ void kineticEnergy(Particles *particles, int n)
Calculate kinetic energy.
Definition: subdomain.cu:2598
Physics::Kernel::sumAngularMomentum
__global__ void sumAngularMomentum(const real *indata, real *outdata)
Calculate angular momentum: sum over blocks.
Definition: subdomain.cu:2576
Physics
Physics related functions and kernels.
Definition: subdomain.cuh:758
ProfilerIds::Time::Gravity::repairTree
const char *const repairTree
Definition: h5profiler.h:93
ProfilerIds::Time::tree
const char *const tree
Definition: h5profiler.h:57
ProfilerIds::numParticles
const char *const numParticles
Definition: h5profiler.h:29
SubDomainKeyTreeNS::Kernel::Launch::compLocalPseudoParticles
real compLocalPseudoParticles(Tree *tree, Particles *particles, DomainList *domainList, int n)
Wrapper for SubDomainKeyTreeNS::Kernel::compLocalPseudoParticles().
Definition: subdomain.cu:1622
SubDomainKeyTreeNS::Kernel::Launch::particlesPerProcess
real particlesPerProcess(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, integer n, integer m, Curve::Type curveType=Curve::lebesgue)
Wrapper for SubDomainKeyTreeNS::Kernel::particlesPerProcess().
Definition: subdomain.cu:1572
SubDomainKeyTreeNS::Kernel::Launch::updateLowestDomainListNodes
real updateLowestDomainListNodes(Particles *particles, DomainList *lowestDomainList, T *buffer, Entry::Name entry)
Wrapper for SubDomainKeyTreeNS::Kernel::updateLowestDomainListNodes().
Definition: subdomain.cu:1605
SubDomainKeyTreeNS::Kernel::Launch::compDomainListPseudoParticlesPerLevel
real compDomainListPseudoParticlesPerLevel(Tree *tree, Particles *particles, DomainList *domainList, DomainList *lowestDomainList, int n, int level)
Wrapper for SubDomainKeyTreeNS::Kernel::compDomainListPseudoParticlesPerLevel().
Definition: subdomain.cu:1628
SubDomainKeyTreeNS::Kernel::Launch::compDomainListPseudoParticles
real compDomainListPseudoParticles(Tree *tree, Particles *particles, DomainList *domainList, DomainList *lowestDomainList, int n)
Wrapper for SubDomainKeyTreeNS::Kernel::compDomainListPseudoParticles().
Definition: subdomain.cu:1635
SubDomainKeyTreeNS::Kernel::Launch::prepareLowestDomainExchange
real prepareLowestDomainExchange(Particles *particles, DomainList *lowestDomainList, T *buffer, Entry::Name entry)
Wrapper for SubDomainKeyTreeNS::Kernel::prepareLowestDomainExchange().
Definition: subdomain.cu:1594
SubDomainKeyTreeNS::Kernel::Launch::zeroDomainListNodes
real zeroDomainListNodes(Particles *particles, DomainList *domainList, DomainList *lowestDomainList)
Wrapper for SubDomainKeyTreeNS::Kernel::zeroDomainListNodes().
Definition: subdomain.cu:1587
SubDomainKeyTreeNS::Kernel::Launch::buildDomainTree
real buildDomainTree(Tree *tree, Particles *particles, DomainList *domainList, integer n, integer m)
Wrapper for SubDomainKeyTreeNS::Kernel::buildDomainTree().
Definition: subdomain.cu:1550
SubDomainKeyTreeNS::Kernel::Launch::createKeyHistRanges
real createKeyHistRanges(Helper *helper, integer bins)
Definition: subdomain.cu:1650
SubDomainKeyTreeNS::Kernel::Launch::repairTree
real repairTree(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, DomainList *lowestDomainList, int n, int m, Curve::Type curveType)
Wrapper for SubDomainKeyTreeNS::Kernel::repairTree().
Definition: subdomain.cu:1642
SubDomainKeyTreeNS::Kernel::Launch::getParticleKeys
real getParticleKeys(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, keyType *keys, integer maxLevel, integer n, Curve::Type curveType=Curve::lebesgue)
Wrapper for SubDomainKeyTreeNS::Kernel::getParticleKeys().
Definition: subdomain.cu:1564
SubDomainKeyTreeNS::Kernel::Launch::keyHistCounter
real keyHistCounter(Tree *tree, Particles *particles, SubDomainKeyTree *subDomainKeyTree, Helper *helper, int bins, int n, Curve::Type curveType=Curve::lebesgue)
Definition: subdomain.cu:1655
SubDomainKeyTreeNS::Kernel::Launch::markParticlesProcess
real markParticlesProcess(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, integer n, integer m, integer *sortArray, Curve::Type curveType=Curve::lebesgue)
Wrapper for ::SubDomainKeyTreeNS::Kernel::markParticlesPerProcess().
Definition: subdomain.cu:1579
SubDomainKeyTreeNS::Kernel::Launch::compLowestDomainListNodes
real compLowestDomainListNodes(Tree *tree, Particles *particles, DomainList *lowestDomainList)
Wrapper for SubDomainKeyTreeNS::Kernel::compLowestDomainListNodes().
Definition: subdomain.cu:1616
SubDomainKeyTreeNS::Kernel::Launch::set
void set(SubDomainKeyTree *subDomainKeyTree, integer rank, integer numProcesses, keyType *range, integer *procParticleCounter)
Wrapper for SubDomainKeyTreeNS::Kernel::set().
Definition: subdomain.cu:1538
SubDomainKeyTreeNS::Kernel::Launch::test
void test(SubDomainKeyTree *subDomainKeyTree)
Wrapper for SubDomainKeyTreeNS::Kernel::test().
Definition: subdomain.cu:1545
SubDomainKeyTreeNS::Kernel::Launch::calculateNewRange
real calculateNewRange(SubDomainKeyTree *subDomainKeyTree, Helper *helper, int bins, int n, Curve::Type curveType=Curve::lebesgue)
Definition: subdomain.cu:1662
SubDomainKeyTreeNS::Kernel::markParticlesProcess
__global__ void markParticlesProcess(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, integer n, integer m, integer *sortArray, Curve::Type curveType=Curve::lebesgue)
Kernel to mark particle's belonging.
Definition: subdomain.cu:921
SubDomainKeyTreeNS::Kernel::test
__global__ void test(SubDomainKeyTree *subDomainKeyTree)
Test kernel call (for debugging/testing purposes).
Definition: subdomain.cu:142
SubDomainKeyTreeNS::Kernel::compDomainListPseudoParticles
__global__ void compDomainListPseudoParticles(Tree *tree, Particles *particles, DomainList *domainList, DomainList *lowestDomainList, int n)
Definition: subdomain.cu:1291
SubDomainKeyTreeNS::Kernel::getParticleKeys
__global__ void getParticleKeys(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, keyType *keys, integer maxLevel, integer n, Curve::Type curveType=Curve::lebesgue)
Kernel to get all particle keys (and additional information for debugging purposes).
Definition: subdomain.cu:849
SubDomainKeyTreeNS::Kernel::repairTree
__global__ void repairTree(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, DomainList *lowestDomainList, int n, int m, Curve::Type curveType)
Repair tree by removing received and inserted (pseudo-)particles.
Definition: subdomain.cu:1355
SubDomainKeyTreeNS::Kernel::set
__global__ void set(SubDomainKeyTree *subDomainKeyTree, integer rank, integer numProcesses, keyType *range, integer *procParticleCounter)
Kernel call to setter.
Definition: subdomain.cu:137
SubDomainKeyTreeNS::Kernel::compLowestDomainListNodes
__global__ void compLowestDomainListNodes(Tree *tree, Particles *particles, DomainList *lowestDomainList)
Compute/Find lowest domain list nodes.
Definition: subdomain.cu:1087
SubDomainKeyTreeNS::Kernel::prepareLowestDomainExchange
__global__ void prepareLowestDomainExchange(Particles *particles, DomainList *lowestDomainList, T *buffer, Entry::Name entry)
Prepare lowest domain exchange via MPI by copying to contiguous memory.
Definition: subdomain.cu:995
SubDomainKeyTreeNS::Kernel::updateLowestDomainListNodes
__global__ void updateLowestDomainListNodes(Particles *particles, DomainList *lowestDomainList, T *buffer, Entry::Name entry)
Update lowest domain list nodes.
Definition: subdomain.cu:1034
SubDomainKeyTreeNS::Kernel::particlesPerProcess
__global__ void particlesPerProcess(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, integer n, integer m, Curve::Type curveType=Curve::lebesgue)
Kernel to check particle's belonging and count in dependence of belonging/process.
Definition: subdomain.cu:897
SubDomainKeyTreeNS::Kernel::keyHistCounter
__global__ void keyHistCounter(Tree *tree, Particles *particles, SubDomainKeyTree *subDomainKeyTree, Helper *helper, int bins, int n, Curve::Type curveType=Curve::lebesgue)
Definition: subdomain.cu:1479
SubDomainKeyTreeNS::Kernel::compLocalPseudoParticles
__global__ void compLocalPseudoParticles(Tree *tree, Particles *particles, DomainList *domainList, int n)
Compute local (tree) pseudo particles.
Definition: subdomain.cu:1139
SubDomainKeyTreeNS::Kernel::calculateNewRange
__global__ void calculateNewRange(SubDomainKeyTree *subDomainKeyTree, Helper *helper, int bins, int n, Curve::Type curveType=Curve::lebesgue)
Definition: subdomain.cu:1507
SubDomainKeyTreeNS::Kernel::compDomainListPseudoParticlesPerLevel
__global__ void compDomainListPseudoParticlesPerLevel(Tree *tree, Particles *particles, DomainList *domainList, DomainList *lowestDomainList, int n, int level)
Compute domain list pseudo particles (per level).
Definition: subdomain.cu:1175
SubDomainKeyTreeNS::Kernel::buildDomainTree
__global__ void buildDomainTree(Tree *tree, Particles *particles, DomainList *domainList, integer n, integer m)
Kernel to build the domain tree.
Definition: subdomain.cu:148
SubDomainKeyTreeNS::Kernel::zeroDomainListNodes
__global__ void zeroDomainListNodes(Particles *particles, DomainList *domainList, DomainList *lowestDomainList)
Zero domain list nodes.
Definition: subdomain.cu:946
SubDomainKeyTreeNS::Kernel::createKeyHistRanges
__global__ void createKeyHistRanges(Helper *helper, integer bins)
Definition: subdomain.cu:1459
SubDomainKeyTreeNS
SubDomainKeyTree related functions and kernels.
Definition: subdomain.cuh:127
cuda::math::sqrt
__device__ real sqrt(real a)
Square root of a floating point value.
Definition: cuda_utilities.cu:456
cuda::math::abs
__device__ real abs(real a)
Absolute value of a floating point value.
Definition: cuda_utilities.cu:448
cuda::set
void set(T *d_var, T val, std::size_t count=1)
Set device memory to a specific value.
Definition: cuda_runtime.h:56
cuda::launch
real launch(bool timeKernel, const ExecutionPolicy &policy, void(*f)(Arguments...), Arguments... args)
CUDA execution wrapper function.
Definition: cuda_launcher.cuh:114
real
double real
Definition: parameter.h:15
keyType
unsigned long keyType
Definition: parameter.h:18
integer
int integer
Definition: parameter.h:17
MAX_LEVEL
#define MAX_LEVEL
Definition: parameter.h:25
KEY_MAX
#define KEY_MAX
Definition: parameter.h:102
POW_DIM
#define POW_DIM
Definition: parameter.h:40
DIM
#define DIM
Dimension of the problem.
Definition: parameter.h:38
Curve::Type
Type
Definition: parameter.h:206
Curve::lebesgue
@ lebesgue
Definition: parameter.h:207
Curve::hilbert
@ hilbert
Definition: parameter.h:207
Entry::Name
Name
Definition: parameter.h:255
Entry::mass
@ mass
Definition: parameter.h:263
Entry::y
@ y
Definition: parameter.h:258
Entry::x
@ x
Definition: parameter.h:256
Entry::z
@ z
Definition: parameter.h:260

milupHPC - src/subdomain_key_tree/subdomain.cu Source File
Generated on Wed Aug 31 2022 12:16:53 by Doxygen 1.9.3