milupHPC documentation
  • src
  • subdomain_key_tree
tree.cu
Go to the documentation of this file.
1#include "../../include/subdomain_key_tree/tree.cuh"
2#include "../../include/cuda_utils/cuda_launcher.cuh"
3
4CUDA_CALLABLE_MEMBER keyType KeyNS::lebesgue2hilbert(keyType lebesgue, integer maxLevel) {
5
6 //keyType hilbert = 0UL;
7 //integer dir = 0;
8 //for (integer lvl=maxLevel; lvl>0; lvl--) {
9 // keyType cell = (lebesgue >> ((lvl-1)*DIM)) & (keyType)((1<<DIM)-1);
10 // hilbert = hilbert << DIM;
11 // if (lvl > 0) {
12 // hilbert += HilbertTable[dir][cell];
13 // }
14 // dir = DirTable[dir][cell];
15 //}
16 //return hilbert;
17
18 keyType hilbert = 1UL;
19 int level = 0, dir = 0;
20 for (keyType tmp=lebesgue; tmp>1; level++) {
21 tmp>>=DIM;
22 }
23 if (level == 0) {
24 hilbert = 0UL;
25 }
26 for (; level>0; level--) {
27 int cell = (lebesgue >> ((level-1)*DIM)) & ((1<<DIM)-1);
28 hilbert = (hilbert<<DIM) + HilbertTable[dir][cell];
29 dir = DirTable[dir][cell];
30 }
31
32 //if (lebesgue == 0UL) {
33 // printf("HERE: lebesgue = %lu --> level = %i, hilbert = %lu\n", lebesgue, rememberLevel, hilbert);
34 //}
35
36 return hilbert;
37
38}
39
40CUDA_CALLABLE_MEMBER keyType KeyNS::lebesgue2hilbert(keyType lebesgue, int maxLevel, int level) {
41
42 keyType hilbert = 0UL; // 0UL is our root, placeholder bit omitted
43 int dir = 0;
44 for (int lvl=maxLevel; lvl>0; lvl--) {
45 int cell = (lebesgue >> ((lvl-1)*DIM)) & (keyType)((1<<DIM)-1);
46 hilbert = hilbert<<DIM;
47 if (lvl>maxLevel-level) {
48 hilbert += HilbertTable[dir][cell];
49 }
50 dir = DirTable[dir][cell];
51 }
52 return hilbert;
53}
54
55CUDA_CALLABLE_MEMBER Tree::Tree() {
56
57}
58
59CUDA_CALLABLE_MEMBER Tree::Tree(integer *count, integer *start, integer *child, integer *sorted, integer *index,
60 integer *toDeleteLeaf, integer *toDeleteNode, real *minX, real *maxX) : count(count),
61 start(start), child(child), sorted(sorted), index(index), toDeleteLeaf(toDeleteLeaf),
62 toDeleteNode(toDeleteNode), minX(minX), maxX(maxX) {
63
64}
65CUDA_CALLABLE_MEMBER void Tree::set(integer *count, integer *start, integer *child, integer *sorted,
66 integer *index, integer *toDeleteLeaf, integer *toDeleteNode,
67 real *minX, real *maxX) {
68 this->count = count;
69 this->start = start;
70 this->child = child;
71 this->sorted = sorted;
72 this->index = index;
73 this->toDeleteNode = toDeleteNode;
74 this->toDeleteLeaf = toDeleteLeaf;
75 this->minX = minX;
76 this->maxX = maxX;
77}
78
79#if DIM > 1
80CUDA_CALLABLE_MEMBER Tree::Tree(integer *count, integer *start, integer *child, integer *sorted, integer *index,
81 integer *toDeleteLeaf, integer *toDeleteNode, real *minX, real *maxX, real *minY,
82 real *maxY) : count(count), start(start), child(child), sorted(sorted), index(index),
83 toDeleteLeaf(toDeleteLeaf), toDeleteNode(toDeleteNode), minX(minX), maxX(maxX),
84 minY(minY), maxY(maxY) {
85
86}
87CUDA_CALLABLE_MEMBER void Tree::set(integer *count, integer *start, integer *child, integer *sorted,
88 integer *index, integer *toDeleteLeaf, integer *toDeleteNode, real *minX,
89 real *maxX, real *minY, real *maxY) {
90 this->count = count;
91 this->start = start;
92 this->child = child;
93 this->sorted = sorted;
94 this->index = index;
95 this->toDeleteNode = toDeleteNode;
96 this->toDeleteLeaf = toDeleteLeaf;
97 this->minX = minX;
98 this->maxX = maxX;
99 this->minY = minY;
100 this->maxY = maxY;
101}
102
103#if DIM == 3
104CUDA_CALLABLE_MEMBER Tree::Tree(integer *count, integer *start, integer *child, integer *sorted, integer *index,
105 integer *toDeleteLeaf, integer *toDeleteNode,
106 real *minX, real *maxX, real *minY, real *maxY, real *minZ, real *maxZ) : count(count),
107 start(start), child(child), sorted(sorted), index(index), toDeleteLeaf(toDeleteLeaf),
108 toDeleteNode(toDeleteNode), minX(minX), maxX(maxX), minY(minY), maxY(maxY), minZ(minZ),
109 maxZ(maxZ) {
110
111}
112CUDA_CALLABLE_MEMBER void Tree::set(integer *count, integer *start, integer *child, integer *sorted,
113 integer *index, integer *toDeleteLeaf, integer *toDeleteNode,
114 real *minX, real *maxX, real *minY, real *maxY,
115 real *minZ, real *maxZ) {
116 this->count = count;
117 this->start = start;
118 this->child = child;
119 this->sorted = sorted;
120 this->index = index;
121 this->toDeleteNode = toDeleteNode;
122 this->toDeleteLeaf = toDeleteLeaf;
123 this->minX = minX;
124 this->maxX = maxX;
125 this->minY = minY;
126 this->maxY = maxY;
127 this->minZ = minZ;
128 this->maxZ = maxZ;
129}
130#endif
131#endif
132
133CUDA_CALLABLE_MEMBER void Tree::reset(integer index, integer n) {
134#if DIM == 1
135 #pragma unroll 2
136#elif DIM == 2
137 #pragma unroll 4
138#else
139 #pragma unroll 8
140#endif
141 for (integer i=0; i<POW_DIM; i++) {
142 // reset child indices
143 child[index * POW_DIM + i] = -1;
144 }
145 // reset counter in dependence of being a node or a leaf
146 if (index < n) {
147 count[index] = 1;
148 }
149 else {
150 count[index] = 0;
151 }
152 // reset start
153 start[index] = -1;
154 sorted[index] = 0;
155}
156
157CUDA_CALLABLE_MEMBER keyType Tree::getParticleKey(Particles *particles, integer index, integer maxLevel,
158 Curve::Type curveType) {
159
160 integer level = 0;
161 keyType particleKey = (keyType)0;
162
163 integer sonBox;
164 real min_x = *minX;
165 real max_x = *maxX;
166#if DIM > 1
167 real min_y = *minY;
168 real max_y = *maxY;
169#if DIM == 3
170 real min_z = *minZ;
171 real max_z = *maxZ;
172#endif
173#endif
174
175 integer particleLevel;
176 integer particleLevelTemp = 0;
177 integer childIndex = 0;
178 // calculate path to the particle's position assuming an (oct-)tree with above bounding boxes
179 while (level <= maxLevel) {
180 sonBox = 0;
181 // find insertion point for body
182 if (particles->x[index] < 0.5 * (min_x + max_x)) {
183 sonBox += 1;
184 max_x = 0.5 * (min_x + max_x);
185 }
186 else { min_x = 0.5 * (min_x + max_x); }
187#if DIM > 1
188 if (particles->y[index] < 0.5 * (min_y+max_y)) {
189 sonBox += 2;
190 max_y = 0.5 * (min_y + max_y);
191 }
192 else { min_y = 0.5 * (min_y + max_y); }
193#if DIM == 3
194 if (particles->z[index] < 0.5 * (min_z+max_z)) {
195 sonBox += 4;
196 max_z = 0.5 * (min_z + max_z);
197 }
198 else { min_z = 0.5 * (min_z + max_z); }
199#endif
200#endif
201 particleKey = particleKey | ((keyType)sonBox << (keyType)(DIM * (maxLevel-level-1)));
202 level++;
203
204 particleLevelTemp++;
205 if (childIndex == index) {
206 particleLevel = particleLevelTemp;
207 }
208 //for (int i_child = 0; i_child < POW_DIM; i_child++) {
209 // if (child[POW_DIM * childIndex + i_child] == index) {
210 // printf("found index = %i for child[8 * %i + %i] = %i\n", index, childIndex, i_child, child[POW_DIM * childIndex + i_child]);
211 // break;
212 // }
213 //}
214 childIndex = child[POW_DIM * childIndex + sonBox];
215 }
216
217 //if (particleLevel == 0) {
218 // printf("particleLevel = %i particleLevelTemp = %i index = %i (%f, %f, %f)\n", particleLevel, particleLevelTemp, index,
219 // particles->x[index], particles->y[index], particles->z[index]);
220 //}
221
222 //if (particleKey == 0UL) {
223 // printf("Why key = %lu? x = (%f, %f, %f) min = (%f, %f, %f), max = (%f, %f, %f)\n", particleKey,
224 // particles->x[index], particles->y[index], particles->z[index],
225 // *minX, *minY, *minZ, *maxX, *maxY, *maxZ);
226 //}
227
228 switch (curveType) {
229 case Curve::lebesgue: {
230 return particleKey;
231 }
232 case Curve::hilbert: {
233 return KeyNS::lebesgue2hilbert(particleKey, maxLevel, maxLevel);
234 //return KeyNS::lebesgue2hilbert(particleKey, maxLevel);
235 }
236 default:
237 printf("Curve type not available!\n");
238 return (keyType)0;
239 }
240}
241
242CUDA_CALLABLE_MEMBER integer Tree::getTreeLevel(Particles *particles, integer index, integer maxLevel,
243 Curve::Type curveType) {
244
245 keyType key = getParticleKey(particles, index, maxLevel); //, curveType); //TODO: hilbert working for lebesgue: why???
246 integer level = 0;
247 integer childIndex;
248
249 integer path[MAX_LEVEL];
250 for (integer i=0; i<maxLevel; i++) {
251 path[i] = (integer) (key >> (maxLevel * DIM - DIM * (i + 1)) & (integer)(POW_DIM - 1));
252 //printf("path[%i] = %i\n", i, path[i]);
253 }
254
255 childIndex = 0;
256
257 for (integer i=0; i<maxLevel; i++) {
258 level++;
259 if (childIndex == index) {
260 return level;
261 }
262 childIndex = child[POW_DIM * childIndex + path[i]];
263 }
264
265#if DIM == 3
266#if SAFETY_LEVEL > 1
267 printf("ATTENTION: level = -1 (index = %i x = (%f, %f, %f) %f) tree index = %i\n",
268 index, particles->x[index], particles->y[index], particles->z[index], particles->mass[index], *this->index);
269#endif
270#endif
271
272 //for (integer i=0; i<maxLevel; i++) {
273 // childIndex = child[POW_DIM * childIndex + path[i]];
274 // for (int k=0; k<8; k++) {
275 // if (child[POW_DIM * childIndex + k] == index) {
276 // printf("FOUND index = %i in level %i for child = %i x = (%f, %f, %f) ((%i, %i), (%i, %i))\n", index, i, k,
277 // particles->x[index], particles->y[index], particles->z[index],
278 // toDeleteLeaf[0], toDeleteLeaf[1], toDeleteNode[0], toDeleteNode[1]);
279 // }
280 // }
281 // //printf("index = %i, path[%i] = %i, childIndex = %i\n", index, i, path[i], childIndex);
282 //}
283
284 return -1;
285}
286
287//TODO: is this still working? since count only used within buildTree (probably yes)
288CUDA_CALLABLE_MEMBER integer Tree::sumParticles() {
289 integer sumParticles = 0;
290 // sum over first level tree count values
291 for (integer i=0; i<POW_DIM; i++) {
292 sumParticles += count[child[i]];
293 }
294 printf("sumParticles = %i\n", sumParticles);
295 return sumParticles;
296}
297
298CUDA_CALLABLE_MEMBER Tree::~Tree() {
299
300}
301
302__global__ void TreeNS::Kernel::computeBoundingBox(Tree *tree, Particles *particles, integer *mutex, integer n,
303 integer blockSize) {
304
305 integer index = threadIdx.x + blockDim.x * blockIdx.x;
306 integer stride = blockDim.x * gridDim.x;
307
308 real x_min = particles->x[index];
309 real x_max = particles->x[index];
310#if DIM > 1
311 real y_min = particles->y[index];
312 real y_max = particles->y[index];
313#if DIM == 3
314 real z_min = particles->z[index];
315 real z_max = particles->z[index];
316#endif
317#endif
318
319 extern __shared__ real buffer[];
320
321 real* x_min_buffer = (real*)buffer;
322 real* x_max_buffer = (real*)&x_min_buffer[blockSize];
323#if DIM > 1
324 real* y_min_buffer = (real*)&x_max_buffer[blockSize];
325 real* y_max_buffer = (real*)&y_min_buffer[blockSize];
326#if DIM == 3
327 real* z_min_buffer = (real*)&y_max_buffer[blockSize];
328 real* z_max_buffer = (real*)&z_min_buffer[blockSize];
329#endif
330#endif
331
332 integer offset = stride;
333
334 // find (local) min/max
335 while (index + offset < n) {
336
337 x_min = cuda::math::min(x_min, particles->x[index + offset]);
338 x_max = cuda::math::max(x_max, particles->x[index + offset]);
339#if DIM > 1
340 y_min = cuda::math::min(y_min, particles->y[index + offset]);
341 y_max = cuda::math::max(y_max, particles->y[index + offset]);
342#if DIM == 3
343 z_min = cuda::math::min(z_min, particles->z[index + offset]);
344 z_max = cuda::math::max(z_max, particles->z[index + offset]);
345#endif
346#endif
347
348 offset += stride;
349 }
350
351 // save value in corresponding buffer
352 x_min_buffer[threadIdx.x] = x_min;
353 x_max_buffer[threadIdx.x] = x_max;
354#if DIM > 1
355 y_min_buffer[threadIdx.x] = y_min;
356 y_max_buffer[threadIdx.x] = y_max;
357#if DIM == 3
358 z_min_buffer[threadIdx.x] = z_min;
359 z_max_buffer[threadIdx.x] = z_max;
360#endif
361#endif
362
363 // synchronize threads / wait for unfinished threads
364 __syncthreads();
365
366 integer i = blockDim.x/2; // assuming blockDim.x is a power of 2!
367
368 // reduction within block
369 while (i != 0) {
370 if (threadIdx.x < i) {
371 x_min_buffer[threadIdx.x] = cuda::math::min(x_min_buffer[threadIdx.x], x_min_buffer[threadIdx.x + i]);
372 x_max_buffer[threadIdx.x] = cuda::math::max(x_max_buffer[threadIdx.x], x_max_buffer[threadIdx.x + i]);
373#if DIM > 1
374 y_min_buffer[threadIdx.x] = cuda::math::min(y_min_buffer[threadIdx.x], y_min_buffer[threadIdx.x + i]);
375 y_max_buffer[threadIdx.x] = cuda::math::max(y_max_buffer[threadIdx.x], y_max_buffer[threadIdx.x + i]);
376#if DIM == 3
377 z_min_buffer[threadIdx.x] = cuda::math::min(z_min_buffer[threadIdx.x], z_min_buffer[threadIdx.x + i]);
378 z_max_buffer[threadIdx.x] = cuda::math::max(z_max_buffer[threadIdx.x], z_max_buffer[threadIdx.x + i]);
379#endif
380#endif
381
382 }
383 __syncthreads();
384 i /= 2;
385 }
386
387 // combining the results and generate the root cell
388 if (threadIdx.x == 0) {
389 while (atomicCAS(mutex, 0 ,1) != 0); // lock
390
391 *tree->minX = cuda::math::min(*tree->minX, x_min_buffer[0]);
392 *tree->maxX = cuda::math::max(*tree->maxX, x_max_buffer[0]);
393#if DIM > 1
394 *tree->minY = cuda::math::min(*tree->minY, y_min_buffer[0]);
395 *tree->maxY = cuda::math::max(*tree->maxY, y_max_buffer[0]);
396
397#if CUBIC_DOMAINS
398 if (*tree->minY < *tree->minX) {
399 *tree->minX = *tree->minY;
400 }
401 else {
402 *tree->minY = *tree->minX;
403 }
404 if (*tree->maxY > *tree->maxX) {
405 *tree->maxX = *tree->maxY;
406 }
407 else {
408 *tree->maxY = *tree->maxX;
409 }
410#endif
411
412#if DIM == 3
413 *tree->minZ = cuda::math::min(*tree->minZ, z_min_buffer[0]);
414 *tree->maxZ = cuda::math::max(*tree->maxZ, z_max_buffer[0]);
415
416#if CUBIC_DOMAINS
417 if (*tree->minZ < *tree->minX) {
418 *tree->minX = *tree->minZ;
419 *tree->minY = *tree->minZ;
420 }
421 else {
422 *tree->minZ = *tree->minX;
423 }
424 if (*tree->maxZ > *tree->maxX) {
425 *tree->maxX = *tree->maxZ;
426 *tree->maxY = *tree->maxZ;
427 }
428 else {
429 *tree->maxZ = *tree->maxX;
430 }
431#endif
432
433#endif
434#endif
435 atomicExch(mutex, 0); // unlock
436 }
437}
438
439// just a wrapper for the member function
440__global__ void TreeNS::Kernel::sumParticles(Tree *tree) {
441
442 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
443 integer stride = blockDim.x * gridDim.x;
444 integer offset = 0;
445
446 if (bodyIndex == 0) {
447 integer sumParticles = tree->sumParticles();
448 printf("sumParticles = %i\n", sumParticles);
449 }
450}
451
452#define COMPUTE_DIRECTLY 0
453
454/*
455__global__ void TreeNS::Kernel::buildTree(Tree *tree, Particles *particles, integer n, integer m) {
456
457 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
458 integer stride = blockDim.x * gridDim.x;
459
460 //note: -1 used as "null pointer"
461 //note: -2 used to lock a child (pointer)
462
463 integer offset;
464 int level;
465 bool newBody = true;
466
467 real min_x;
468 real max_x;
469 real x;
470#if DIM > 1
471 real y;
472 real min_y;
473 real max_y;
474#if DIM == 3
475 real z;
476 real min_z;
477 real max_z;
478#endif
479#endif
480
481 integer childPath;
482 integer temp;
483
484 offset = 0;
485
486 while ((bodyIndex + offset) < n) {
487
488 if (newBody) {
489
490 newBody = false;
491 level = 0;
492
493 // copy bounding box(es)
494 min_x = *tree->minX;
495 max_x = *tree->maxX;
496 x = particles->x[bodyIndex + offset];
497#if DIM > 1
498 y = particles->y[bodyIndex + offset];
499 min_y = *tree->minY;
500 max_y = *tree->maxY;
501#if DIM == 3
502 z = particles->z[bodyIndex + offset];
503 min_z = *tree->minZ;
504 max_z = *tree->maxZ;
505#endif
506#endif
507 temp = 0;
508 childPath = 0;
509
510 // find insertion point for body
511 // x direction
512 if (x < 0.5 * (min_x + max_x)) { // x direction
513 childPath += 1;
514 max_x = 0.5 * (min_x + max_x);
515 }
516 else {
517 min_x = 0.5 * (min_x + max_x);
518 }
519#if DIM > 1
520 // y direction
521 if (y < 0.5 * (min_y + max_y)) { // y direction
522 childPath += 2;
523 max_y = 0.5 * (min_y + max_y);
524 }
525 else {
526 min_y = 0.5 * (min_y + max_y);
527 }
528#if DIM == 3
529 // z direction
530 if (z < 0.5 * (min_z + max_z)) { // z direction
531 childPath += 4;
532 max_z = 0.5 * (min_z + max_z);
533 }
534 else {
535 min_z = 0.5 * (min_z + max_z);
536 }
537#endif
538#endif
539 }
540
541 integer childIndex = tree->child[temp*POW_DIM + childPath];
542
543 // traverse tree until hitting leaf node
544 while (childIndex >= m) { //n
545
546 temp = childIndex;
547 level++;
548
549 childPath = 0;
550
551 // find insertion point for body
552 if (x < 0.5 * (min_x + max_x)) { // x direction
553 childPath += 1;
554 max_x = 0.5 * (min_x + max_x);
555 }
556 else {
557 min_x = 0.5 * (min_x + max_x);
558 }
559#if DIM > 1
560 if (y < 0.5 * (min_y + max_y)) { // y direction
561 childPath += 2;
562 max_y = 0.5 * (min_y + max_y);
563 }
564 else {
565 min_y = 0.5 * (min_y + max_y);
566 }
567#if DIM == 3
568 if (z < 0.5 * (min_z + max_z)) { // z direction
569 childPath += 4;
570 max_z = 0.5 * (min_z + max_z);
571 }
572 else {
573 min_z = 0.5 * (min_z + max_z);
574 }
575#endif
576#endif
577
578#if COMPUTE_DIRECTLY
579 if (particles->mass[bodyIndex + offset] != 0) {
580 //particles->x[temp] += particles->weightedEntry(bodyIndex + offset, Entry::x);
581 atomicAdd(&particles->x[temp], particles->weightedEntry(bodyIndex + offset, Entry::x));
582#if DIM > 1
583 //particles->y[temp] += particles->weightedEntry(bodyIndex + offset, Entry::y);
584 atomicAdd(&particles->y[temp], particles->weightedEntry(bodyIndex + offset, Entry::y));
585#if DIM == 3
586 //particles->z[temp] += particles->weightedEntry(bodyIndex + offset, Entry::z);
587 atomicAdd(&particles->z[temp], particles->weightedEntry(bodyIndex + offset, Entry::z));
588#endif
589#endif
590 }
591
592 //particles->mass[temp] += particles->mass[bodyIndex + offset];
593 atomicAdd(&particles->mass[temp], particles->mass[bodyIndex + offset]);
594#endif // COMPUTE_DIRECTLY
595
596 atomicAdd(&tree->count[temp], 1);
597 childIndex = tree->child[POW_DIM * temp + childPath];
598 }
599
600 // if child is not locked
601 if (childIndex != -2) {
602
603 integer locked = temp * POW_DIM + childPath;
604
605 if (atomicCAS(&tree->child[locked], childIndex, -2) == childIndex) {
606
607 // check whether a body is already stored at the location
608 if (childIndex == -1) {
609 //insert body and release lock
610 tree->child[locked] = bodyIndex + offset;
611 particles->level[bodyIndex + offset] = level + 1;
612
613 }
614 else {
615 if (childIndex >= n) {
616 printf("ATTENTION!\n");
617 }
618 integer patch = POW_DIM * m; //8*n
619 while (childIndex >= 0 && childIndex < n) { // was n
620
621 //create a new cell (by atomically requesting the next unused array index)
622 integer cell = atomicAdd(tree->index, 1);
623 patch = min(patch, cell);
624
625 if (patch != cell) {
626 tree->child[POW_DIM * temp + childPath] = cell;
627 }
628
629 level++;
630 // ATTENTION: most likely a problem with level counting (level = level - 1)
631 // but could be also a problem regarding maximum tree depth...
632 if (level > (MAX_LEVEL + 1)) {
633#if DIM == 1
634 cudaAssert("buildTree: level = %i for index %i (%e)", level,
635 bodyIndex + offset, particles->x[bodyIndex + offset]);
636#elif DIM == 2
637 cudaAssert("buildTree: level = %i for index %i (%e, %e)", level,
638 bodyIndex + offset, particles->x[bodyIndex + offset],
639 particles->y[bodyIndex + offset]);
640#else
641 cudaAssert("buildTree: level = %i for index %i (%e, %e, %e)", level,
642 bodyIndex + offset, particles->x[bodyIndex + offset],
643 particles->y[bodyIndex + offset],
644 particles->z[bodyIndex + offset]);
645#endif
646 }
647
648 // insert old/original particle
649 childPath = 0;
650 if (particles->x[childIndex] < 0.5 * (min_x + max_x)) { childPath += 1; }
651#if DIM > 1
652 if (particles->y[childIndex] < 0.5 * (min_y + max_y)) { childPath += 2; }
653#if DIM == 3
654 if (particles->z[childIndex] < 0.5 * (min_z + max_z)) { childPath += 4; }
655#endif
656#endif
657#if COMPUTE_DIRECTLY
658 particles->x[cell] += particles->weightedEntry(childIndex, Entry::x);
659 //particles->x[cell] += particles->weightedEntry(childIndex, Entry::x);
660#if DIM > 1
661 particles->y[cell] += particles->weightedEntry(childIndex, Entry::y);
662 //particles->y[cell] += particles->weightedEntry(childIndex, Entry::y);
663#if DIM == 3
664 particles->z[cell] += particles->weightedEntry(childIndex, Entry::z);
665 //particles->z[cell] += particles->weightedEntry(childIndex, Entry::z);
666#endif
667#endif
668
669 //if (cell % 1000 == 0) {
670 // printf("buildTree: x[%i] = (%f, %f, %f) from x[%i] = (%f, %f, %f) m = %f\n", cell, particles->x[cell], particles->y[cell],
671 // particles->z[cell], childIndex, particles->x[childIndex], particles->y[childIndex],
672 // particles->z[childIndex], particles->mass[childIndex]);
673 //}
674
675 particles->mass[cell] += particles->mass[childIndex];
676#else // COMPUTE_DIRECTLY
677 //particles->x[cell] = particles->x[childIndex];
678 particles->x[cell] = 0.5 * (min_x + max_x);
679#if DIM > 1
680 //particles->y[cell] = particles->y[childIndex];
681 particles->y[cell] = 0.5 * (min_y + max_y);
682#if DIM == 3
683 //particles->z[cell] = particles->z[childIndex];
684 particles->z[cell] = 0.5 * (min_z + max_z);
685#endif
686#endif
687
688#endif // COMPUTE_DIRECTLY
689
690 tree->count[cell] += tree->count[childIndex];
691
692 tree->child[POW_DIM * cell + childPath] = childIndex;
693 particles->level[cell] = level;
694 particles->level[childIndex] += 1;
695 tree->start[cell] = -1;
696
697#if DEBUGGING
698 if (particles->level[cell] >= particles->level[childIndex]) {
699 cudaAssert("lvl: %i vs. %i\n", particles->level[cell], particles->level[childIndex]);
700 }
701#endif
702
703 // insert new particle
704 temp = cell;
705 childPath = 0;
706
707 // find insertion point for body
708 //if (particles->x[bodyIndex + offset] < 0.5 * (min_x + max_x)) {
709 if (x < 0.5 * (min_x + max_x)) {
710 childPath += 1;
711 max_x = 0.5 * (min_x + max_x);
712 } else {
713 min_x = 0.5 * (min_x + max_x);
714 }
715#if DIM > 1
716 //if (particles->y[bodyIndex + offset] < 0.5 * (min_y + max_y)) {
717 if (y < 0.5 * (min_y + max_y)) {
718 childPath += 2;
719 max_y = 0.5 * (min_y + max_y);
720 } else {
721 min_y = 0.5 * (min_y + max_y);
722 }
723#if DIM == 3
724 //if (particles->z[bodyIndex + offset] < 0.5 * (min_z + max_z)) {
725 if (z < 0.5 * (min_z + max_z)) {
726 childPath += 4;
727 max_z = 0.5 * (min_z + max_z);
728 } else {
729 min_z = 0.5 * (min_z + max_z);
730 }
731#endif
732#endif
733#if COMPUTE_DIRECTLY
734 // COM / preparing for calculation of COM
735 if (particles->mass[bodyIndex + offset] != 0) {
736 //particles->x[cell] += particles->weightedEntry(bodyIndex + offset, Entry::x);
737 particles->x[cell] += particles->weightedEntry(bodyIndex + offset, Entry::x);
738#if DIM > 1
739 //particles->y[cell] += particles->weightedEntry(bodyIndex + offset, Entry::y);
740 particles->y[cell] += particles->weightedEntry(bodyIndex + offset, Entry::y);
741#if DIM == 3
742 //particles->z[cell] += particles->weightedEntry(bodyIndex + offset, Entry::z);
743 particles->z[cell] += particles->weightedEntry(bodyIndex + offset, Entry::z);
744#endif
745#endif
746 particles->mass[cell] += particles->mass[bodyIndex + offset];
747 }
748#endif // COMPUTE_DIRECTLY
749 tree->count[cell] += tree->count[bodyIndex + offset];
750 childIndex = tree->child[POW_DIM * temp + childPath];
751 }
752
753 tree->child[POW_DIM * temp + childPath] = bodyIndex + offset;
754 particles->level[bodyIndex + offset] = level + 1;
755
756 __threadfence(); // written to global memory arrays (child, x, y, mass) thus need to fence
757 tree->child[locked] = patch;
758 }
759 offset += stride;
760 newBody = true;
761 }
762 }
763 __syncthreads();
764 }
765}
766*/
767
768__global__ void TreeNS::Kernel::buildTree(Tree *tree, Particles *particles, integer n, integer m) {
769
770 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
771 integer stride = blockDim.x * gridDim.x;
772
773 //note: -1 used as "null pointer"
774 //note: -2 used to lock a child (pointer)
775
776 volatile integer *childList = tree->child;
777
778 integer offset;
779 int level;
780 bool newBody = true;
781
782 real min_x;
783 real max_x;
784 real x;
785#if DIM > 1
786 real y;
787 real min_y;
788 real max_y;
789#if DIM == 3
790 real z;
791 real min_z;
792 real max_z;
793#endif
794#endif
795
796 integer childPath;
797 integer temp;
798
799 offset = 0;
800
801 while ((bodyIndex + offset) < n) {
802
803 if (newBody) {
804
805 newBody = false;
806 level = 0;
807
808 // copy bounding box(es)
809 min_x = *tree->minX;
810 max_x = *tree->maxX;
811 x = particles->x[bodyIndex + offset];
812#if DIM > 1
813 y = particles->y[bodyIndex + offset];
814 min_y = *tree->minY;
815 max_y = *tree->maxY;
816#if DIM == 3
817 z = particles->z[bodyIndex + offset];
818 min_z = *tree->minZ;
819 max_z = *tree->maxZ;
820#endif
821#endif
822 temp = 0;
823 childPath = 0;
824
825 // find insertion point for body
826 // x direction
827 if (x < 0.5 * (min_x + max_x)) { // x direction
828 childPath += 1;
829 max_x = 0.5 * (min_x + max_x);
830 }
831 else {
832 min_x = 0.5 * (min_x + max_x);
833 }
834#if DIM > 1
835 // y direction
836 if (y < 0.5 * (min_y + max_y)) { // y direction
837 childPath += 2;
838 max_y = 0.5 * (min_y + max_y);
839 }
840 else {
841 min_y = 0.5 * (min_y + max_y);
842 }
843#if DIM == 3
844 // z direction
845 if (z < 0.5 * (min_z + max_z)) { // z direction
846 childPath += 4;
847 max_z = 0.5 * (min_z + max_z);
848 }
849 else {
850 min_z = 0.5 * (min_z + max_z);
851 }
852#endif
853#endif
854 }
855
856 register integer childIndex = childList[temp*POW_DIM + childPath];
857
858 // traverse tree until hitting leaf node
859 while (childIndex >= m) { //n
860
861 temp = childIndex;
862 level++;
863
864 childPath = 0;
865
866 // find insertion point for body
867 if (x < 0.5 * (min_x + max_x)) { // x direction
868 childPath += 1;
869 max_x = 0.5 * (min_x + max_x);
870 }
871 else {
872 min_x = 0.5 * (min_x + max_x);
873 }
874#if DIM > 1
875 if (y < 0.5 * (min_y + max_y)) { // y direction
876 childPath += 2;
877 max_y = 0.5 * (min_y + max_y);
878 }
879 else {
880 min_y = 0.5 * (min_y + max_y);
881 }
882#if DIM == 3
883 if (z < 0.5 * (min_z + max_z)) { // z direction
884 childPath += 4;
885 max_z = 0.5 * (min_z + max_z);
886 }
887 else {
888 min_z = 0.5 * (min_z + max_z);
889 }
890#endif
891#endif
892
893#if COMPUTE_DIRECTLY
894 if (particles->mass[bodyIndex + offset] != 0) {
895 //particles->x[temp] += particles->weightedEntry(bodyIndex + offset, Entry::x);
896 atomicAdd(&particles->x[temp], particles->weightedEntry(bodyIndex + offset, Entry::x));
897#if DIM > 1
898 //particles->y[temp] += particles->weightedEntry(bodyIndex + offset, Entry::y);
899 atomicAdd(&particles->y[temp], particles->weightedEntry(bodyIndex + offset, Entry::y));
900#if DIM == 3
901 //particles->z[temp] += particles->weightedEntry(bodyIndex + offset, Entry::z);
902 atomicAdd(&particles->z[temp], particles->weightedEntry(bodyIndex + offset, Entry::z));
903#endif
904#endif
905 }
906
907 //particles->mass[temp] += particles->mass[bodyIndex + offset];
908 atomicAdd(&particles->mass[temp], particles->mass[bodyIndex + offset]);
909#endif // COMPUTE_DIRECTLY
910
911 atomicAdd(&tree->count[temp], 1);
912 childIndex = childList[POW_DIM * temp + childPath];
913 }
914
915 __syncthreads();
916
917 // if child is not locked
918 if (childIndex != -2) {
919
920 integer locked = temp * POW_DIM + childPath;
921
922 if (atomicCAS((int *) &childList[locked], childIndex, -2) == childIndex) {
923
924 // check whether a body is already stored at the location
925 if (childIndex == -1) {
926 //insert body and release lock
927 childList[locked] = bodyIndex + offset;
928 particles->level[bodyIndex + offset] = level + 1;
929
930 }
931 else {
932 if (childIndex >= n) {
933 printf("ATTENTION!\n");
934 }
935 integer patch = POW_DIM * m; //8*n
936 while (childIndex >= 0 && childIndex < n) { // was n
937
938 //create a new cell (by atomically requesting the next unused array index)
939 integer cell = atomicAdd(tree->index, 1);
940 patch = min(patch, cell);
941
942 if (patch != cell) {
943 childList[POW_DIM * temp + childPath] = cell;
944 }
945
946 level++;
947 // ATTENTION: most likely a problem with level counting (level = level - 1)
948 // but could be also a problem regarding maximum tree depth...
949 if (level > (MAX_LEVEL + 1)) {
950#if DIM == 1
951 cudaAssert("buildTree: level = %i for index %i (%e)", level,
952 bodyIndex + offset, particles->x[bodyIndex + offset]);
953#elif DIM == 2
954 cudaAssert("buildTree: level = %i for index %i (%e, %e)", level,
955 bodyIndex + offset, particles->x[bodyIndex + offset],
956 particles->y[bodyIndex + offset]);
957#else
958 cudaAssert("buildTree: level = %i for index %i (%e, %e, %e)", level,
959 bodyIndex + offset, particles->x[bodyIndex + offset],
960 particles->y[bodyIndex + offset],
961 particles->z[bodyIndex + offset]);
962#endif
963 }
964
965 // insert old/original particle
966 childPath = 0;
967 if (particles->x[childIndex] < 0.5 * (min_x + max_x)) { childPath += 1; }
968#if DIM > 1
969 if (particles->y[childIndex] < 0.5 * (min_y + max_y)) { childPath += 2; }
970#if DIM == 3
971 if (particles->z[childIndex] < 0.5 * (min_z + max_z)) { childPath += 4; }
972#endif
973#endif
974#if COMPUTE_DIRECTLY
975 particles->x[cell] += particles->weightedEntry(childIndex, Entry::x);
976 //particles->x[cell] += particles->weightedEntry(childIndex, Entry::x);
977#if DIM > 1
978 particles->y[cell] += particles->weightedEntry(childIndex, Entry::y);
979 //particles->y[cell] += particles->weightedEntry(childIndex, Entry::y);
980#if DIM == 3
981 particles->z[cell] += particles->weightedEntry(childIndex, Entry::z);
982 //particles->z[cell] += particles->weightedEntry(childIndex, Entry::z);
983#endif
984#endif
985
986 //if (cell % 1000 == 0) {
987 // printf("buildTree: x[%i] = (%f, %f, %f) from x[%i] = (%f, %f, %f) m = %f\n", cell, particles->x[cell], particles->y[cell],
988 // particles->z[cell], childIndex, particles->x[childIndex], particles->y[childIndex],
989 // particles->z[childIndex], particles->mass[childIndex]);
990 //}
991
992 particles->mass[cell] += particles->mass[childIndex];
993#else // COMPUTE_DIRECTLY
994 //particles->x[cell] = particles->x[childIndex];
995 particles->x[cell] = 0.5 * (min_x + max_x);
996#if DIM > 1
997 //particles->y[cell] = particles->y[childIndex];
998 particles->y[cell] = 0.5 * (min_y + max_y);
999#if DIM == 3
1000 //particles->z[cell] = particles->z[childIndex];
1001 particles->z[cell] = 0.5 * (min_z + max_z);
1002#endif
1003#endif
1004
1005#endif // COMPUTE_DIRECTLY
1006
1007 tree->count[cell] += tree->count[childIndex];
1008
1009 childList[POW_DIM * cell + childPath] = childIndex;
1010 particles->level[cell] = level;
1011 particles->level[childIndex] += 1;
1012 tree->start[cell] = -1;
1013
1014#if DEBUGGING
1015 if (particles->level[cell] >= particles->level[childIndex]) {
1016 cudaAssert("lvl: %i vs. %i\n", particles->level[cell], particles->level[childIndex]);
1017 }
1018#endif
1019
1020 // insert new particle
1021 temp = cell;
1022 childPath = 0;
1023
1024 // find insertion point for body
1025 //if (particles->x[bodyIndex + offset] < 0.5 * (min_x + max_x)) {
1026 if (x < 0.5 * (min_x + max_x)) {
1027 childPath += 1;
1028 max_x = 0.5 * (min_x + max_x);
1029 } else {
1030 min_x = 0.5 * (min_x + max_x);
1031 }
1032#if DIM > 1
1033 //if (particles->y[bodyIndex + offset] < 0.5 * (min_y + max_y)) {
1034 if (y < 0.5 * (min_y + max_y)) {
1035 childPath += 2;
1036 max_y = 0.5 * (min_y + max_y);
1037 } else {
1038 min_y = 0.5 * (min_y + max_y);
1039 }
1040#if DIM == 3
1041 //if (particles->z[bodyIndex + offset] < 0.5 * (min_z + max_z)) {
1042 if (z < 0.5 * (min_z + max_z)) {
1043 childPath += 4;
1044 max_z = 0.5 * (min_z + max_z);
1045 } else {
1046 min_z = 0.5 * (min_z + max_z);
1047 }
1048#endif
1049#endif
1050#if COMPUTE_DIRECTLY
1051 // COM / preparing for calculation of COM
1052 if (particles->mass[bodyIndex + offset] != 0) {
1053 //particles->x[cell] += particles->weightedEntry(bodyIndex + offset, Entry::x);
1054 particles->x[cell] += particles->weightedEntry(bodyIndex + offset, Entry::x);
1055#if DIM > 1
1056 //particles->y[cell] += particles->weightedEntry(bodyIndex + offset, Entry::y);
1057 particles->y[cell] += particles->weightedEntry(bodyIndex + offset, Entry::y);
1058#if DIM == 3
1059 //particles->z[cell] += particles->weightedEntry(bodyIndex + offset, Entry::z);
1060 particles->z[cell] += particles->weightedEntry(bodyIndex + offset, Entry::z);
1061#endif
1062#endif
1063 particles->mass[cell] += particles->mass[bodyIndex + offset];
1064 }
1065#endif // COMPUTE_DIRECTLY
1066 tree->count[cell] += tree->count[bodyIndex + offset];
1067 childIndex = childList[POW_DIM * temp + childPath];
1068 }
1069
1070 childList[POW_DIM * temp + childPath] = bodyIndex + offset;
1071 particles->level[bodyIndex + offset] = level + 1;
1072
1073 __threadfence(); // written to global memory arrays (child, x, y, mass) thus need to fence
1074 childList[locked] = patch;
1075 }
1076 offset += stride;
1077 newBody = true;
1078 }
1079 }
1080 __syncthreads();
1081 }
1082}
1083
1084/*
1085__global__ void TreeNS::Kernel::buildTree(Tree *tree, Particles *particles, integer n, integer m)
1086{
1087 register int inc = blockDim.x * gridDim.x;
1088 register int i = threadIdx.x + blockIdx.x * blockDim.x;
1089 register int k;
1090 register int childIndex, child;
1091 register int lockedIndex;
1092 register double x;
1093 register double min_x, max_x;
1094#if DIM > 1
1095 register double y;
1096 register double min_y, max_y;
1097#endif
1098 register double r;
1099 register double dx;
1100#if DIM > 1
1101 register double dy;
1102#endif
1103 register double rootX = 0.5 * (*tree->maxX + *tree->minX);
1104 register double rootRadius = 0.5 * (*tree->maxX - *tree->minX);
1105#if DIM > 1
1106 register double rootY = 0.5 * (*tree->maxY + *tree->minY);
1107 rootRadius = cuda::math::max(rootRadius, 0.5 * (*tree->maxY - *tree->minY));
1108#endif
1109 register int depth = 0;
1110 register bool isNewParticle = true;
1111 register int currentNodeIndex;
1112 register int newNodeIndex;
1113 register int subtreeNodeIndex;
1114#if DIM == 3
1115 register double z;
1116 register double min_z, max_z;
1117 register double dz;
1118 register double rootZ = 0.5 * (*tree->maxZ + *tree->minZ);
1119 rootRadius = cuda::math::max(rootRadius, 0.5 * (*tree->maxZ - *tree->minZ));
1120#endif
1121
1122 volatile double *px, *pm;
1123#if DIM > 1
1124 volatile double *py;
1125#if DIM == 3
1126 volatile double *pz;
1127#endif
1128#endif
1129
1130 px = particles->x;
1131 pm = particles->mass;
1132#if DIM > 1
1133 py = particles->y;
1134#if DIM == 3
1135 pz = particles->z;
1136#endif
1137#endif
1138
1139 while (i < n) {
1140 depth = 0;
1141
1142 if (isNewParticle) {
1143 isNewParticle = false;
1144 // cache particle data
1145 x = px[i];
1146 min_x = *tree->minX;
1147 max_x = *tree->maxX;
1148 //p.ax[i] = 0.0;
1149#if DIM > 1
1150 y = py[i];
1151 min_y = *tree->minY;
1152 max_y = *tree->maxY;
1153 //p.ay[i] = 0.0;
1154#if DIM == 3
1155 z = pz[i];
1156 min_z = *tree->minZ;
1157 max_z = *tree->maxZ;
1158 //p.az[i] = 0.0;
1159#endif
1160#endif
1161
1162 // start at root
1163 currentNodeIndex = 0;
1164 r = rootRadius;
1165 childIndex = 0;
1166
1167// if (x > rootX) childIndex = 1;
1168//#if DIM > 1
1169// if (y > rootY) childIndex += 2;
1170//#if DIM == 3
1171// if (z > rootZ) childIndex += 4;
1172//#endif
1173//#endif
1174
1175
1176 // find insertion point for body
1177 // x direction
1178 if (x < 0.5 * (min_x + max_x)) { // x direction
1179 childIndex += 1;
1180 max_x = 0.5 * (min_x + max_x);
1181 }
1182 else {
1183 min_x = 0.5 * (min_x + max_x);
1184 }
1185#if DIM > 1
1186 // y direction
1187 if (y < 0.5 * (min_y + max_y)) { // y direction
1188 childIndex += 2;
1189 max_y = 0.5 * (min_y + max_y);
1190 }
1191 else {
1192 min_y = 0.5 * (min_y + max_y);
1193 }
1194#if DIM == 3
1195 // z direction
1196 if (z < 0.5 * (min_z + max_z)) { // z direction
1197 childIndex += 4;
1198 max_z = 0.5 * (min_z + max_z);
1199 }
1200 else {
1201 min_z = 0.5 * (min_z + max_z);
1202 }
1203#endif
1204#endif
1205
1206 }
1207
1208 // follow path to leaf
1209 child = tree->child[POW_DIM * currentNodeIndex + childIndex]; //childList[childListIndex(currentNodeIndex, childIndex)];
1210 // leaves are 0 ... numParticles
1211 while (child >= m) {
1212 currentNodeIndex = child;
1213 depth++;
1214 r *= 0.5;
1215 // which child?
1216 childIndex = 0;
1217
1218// if (x > px[currentNodeIndex]) childIndex = 1;
1219//#if DIM > 1
1220// if (y > py[currentNodeIndex]) childIndex += 2;
1221//#if DIM > 2
1222// if (z > pz[currentNodeIndex]) childIndex += 4;
1223//#endif
1224//#endif
1225
1226 // find insertion point for body
1227 if (x < 0.5 * (min_x + max_x)) { // x direction
1228 childIndex += 1;
1229 max_x = 0.5 * (min_x + max_x);
1230 }
1231 else {
1232 min_x = 0.5 * (min_x + max_x);
1233 }
1234#if DIM > 1
1235 if (y < 0.5 * (min_y + max_y)) { // y direction
1236 childIndex += 2;
1237 max_y = 0.5 * (min_y + max_y);
1238 }
1239 else {
1240 min_y = 0.5 * (min_y + max_y);
1241 }
1242#if DIM == 3
1243 if (z < 0.5 * (min_z + max_z)) { // z direction
1244 childIndex += 4;
1245 max_z = 0.5 * (min_z + max_z);
1246 }
1247 else {
1248 min_z = 0.5 * (min_z + max_z);
1249 }
1250#endif
1251#endif
1252
1253 child = tree->child[POW_DIM * currentNodeIndex + childIndex]; //childList[childListIndex(currentNodeIndex, childIndex)];
1254 }
1255
1256 // we want to insert the current particle i into currentNodeIndex's child at position childIndex
1257 // where child is now empty, locked or a particle
1258 // if empty -> simply insert, if particle -> create new subtree
1259 if (child != -2) {
1260 // the position where we want to place the particle gets locked
1261 lockedIndex = tree->child[POW_DIM * currentNodeIndex + childIndex]; //childListIndex(currentNodeIndex, childIndex);
1262 // atomic compare and save: compare if child is still the current value of childlist at the index lockedIndex, if so, lock it
1263 // atomicCAS returns the old value of child
1264 if (child == atomicCAS(&tree->child[lockedIndex], child, -2)) { //&childList[lockedIndex]
1265 // if the destination is empty, insert particle
1266 if (child == -1) {
1267 // insert the particle into this leaf
1268 tree->child[lockedIndex] = i; //childList[lockedIndex] = i;
1269 } else {
1270 // there is already a particle, create new inner node
1271 subtreeNodeIndex = POW_DIM * m;
1272 do {
1273 // get the next free nodeIndex
1274 newNodeIndex = atomicAdd(tree->index, 1); //atomicSub((int * ) &maxNodeIndex, 1) - 1;
1275
1276 // throw error if there aren't enough node indices available
1277 //if (newNodeIndex > m) {
1278 //printf("(thread %d): error during tree creation: not enough nodes. newNodeIndex %d, maxNodeIndex %d, numParticles: %d\n", threadIdx.x, newNodeIndex, maxNodeIndex, numParticles);
1279 //assert(0);
1280 //}
1281
1282 // the first available free nodeIndex will be the subtree node
1283 subtreeNodeIndex = min(subtreeNodeIndex, newNodeIndex);
1284
1285 dx = (childIndex & 1) * r;
1286#if DIM > 1
1287 dy = ((childIndex >> 1) & 1) * r;
1288#if DIM == 3
1289 dz = ((childIndex >> 2) & 1) * r;
1290#endif
1291#endif
1292 depth++;
1293 r *= 0.5;
1294
1295 // we save the radius here, so we can use it during neighboursearch. we have to set it to EMPTY after the neighboursearch
1296 pm[newNodeIndex] = r;
1297
1298// dx = px[newNodeIndex] = px[currentNodeIndex] - r + dx;
1299//#if DIM > 1
1300// dy = py[newNodeIndex] = py[currentNodeIndex] - r + dy;
1301//#if DIM == 3
1302// dz = pz[newNodeIndex] = pz[currentNodeIndex] - r + dz;
1303//#endif
1304//#endif
1305
1306
1307 dx = px[newNodeIndex] = (0.5 * (min_x + max_x)) - r + dx;
1308#if DIM > 1
1309 dy = py[newNodeIndex] = (0.5 * (min_y + max_y)) - r + dy;
1310#if DIM == 3
1311 dz = pz[newNodeIndex] = (0.5 * (min_z + max_z)) - r + dz;
1312#endif
1313#endif
1314
1315 //for (k = 0; k < POW_DIM; k++) {
1316 // tree->child[POW_DIM * newNodeIndex + k] = -1; //childList[childListIndex(newNodeIndex, k)] = EMPTY;
1317 //}
1318
1319 if (subtreeNodeIndex != newNodeIndex) {
1320 // this condition is true when the two particles are so close to each other, that they are
1321 // again put into the same node, so we have to create another new inner node.
1322 // in this case, currentNodeIndex is the previous newNodeIndex
1323 // and childIndex is the place where the particle i belongs to, relative to the previous newNodeIndex
1324 tree->child[POW_DIM * currentNodeIndex + childIndex] = newNodeIndex; //childList[childListIndex(currentNodeIndex, childIndex)] = newNodeIndex;
1325 }
1326
1327 childIndex = 0;
1328
1329// if (px[child] > dx) childIndex = 1;
1330//#if DIM > 1
1331// if (py[child] > dy) childIndex += 2;
1332//#if DIM == 3
1333// if (pz[child] > dz) childIndex += 4;
1334//#endif
1335//#endif
1336
1337 //if (particles->x[bodyIndex + offset] < 0.5 * (min_x + max_x)) {
1338 if (px[child] < 0.5 * (min_x + max_x)) {
1339 childIndex += 1;
1340 max_x = 0.5 * (min_x + max_x);
1341 } else {
1342 min_x = 0.5 * (min_x + max_x);
1343 }
1344#if DIM > 1
1345 //if (particles->y[bodyIndex + offset] < 0.5 * (min_y + max_y)) {
1346 if (py[child] < 0.5 * (min_y + max_y)) {
1347 childIndex += 2;
1348 max_y = 0.5 * (min_y + max_y);
1349 } else {
1350 min_y = 0.5 * (min_y + max_y);
1351 }
1352#if DIM == 3
1353 //if (particles->z[bodyIndex + offset] < 0.5 * (min_z + max_z)) {
1354 if (pz[child] < 0.5 * (min_z + max_z)) {
1355 childIndex += 4;
1356 max_z = 0.5 * (min_z + max_z);
1357 } else {
1358 min_z = 0.5 * (min_z + max_z);
1359 }
1360#endif
1361#endif
1362 tree->child[POW_DIM * newNodeIndex + childIndex] = child; //childList[childListIndex(newNodeIndex, childIndex)] = child;
1363
1364 // compare positions of particle i to the new node
1365 currentNodeIndex = newNodeIndex;
1366 childIndex = 0;
1367
1368// if (x > dx) childIndex = 1;
1369//#if DIM > 1
1370// if (y > dy) childIndex += 2;
1371//#if DIM == 3
1372// if (z > dz) childIndex += 4;
1373//#endif
1374//#endif
1375
1376 if (x < 0.5 * (min_x + max_x)) { childIndex += 1; }
1377#if DIM > 1
1378 if (y < 0.5 * (min_y + max_y)) { childIndex += 2; }
1379#if DIM == 3
1380 if (z < 0.5 * (min_z + max_z)) { childIndex += 4; }
1381#endif
1382#endif
1383 child = tree->child[POW_DIM * currentNodeIndex + childIndex]; //child = childList[childListIndex(currentNodeIndex, childIndex)];
1384 // continue creating new nodes (with half radius each) until the other particle is not in the same spot in the tree
1385 } while (child >= 0 && child < n);
1386
1387 tree->child[POW_DIM * currentNodeIndex + childIndex] = i; //childList[childListIndex(currentNodeIndex, childIndex)] = i;
1388 __threadfence();
1389 //__threadfence() is used to halt the current thread until all previous writes to shared and global memory are visible
1390 // by other threads. It does not halt nor affect the position of other threads though!
1391 tree->child[lockedIndex] = subtreeNodeIndex; //childList[lockedIndex] = subtreeNodeIndex;
1392 }
1393 //p.depth[i] = depth;
1394 // continue with next particle
1395 i += inc;
1396 isNewParticle = true;
1397 }
1398 }
1399 __syncthreads(); // child was locked, wait for other threads to unlock
1400 }
1401}
1402*/
1403
1404__global__ void TreeNS::Kernel::prepareSorting(Tree *tree, Particles *particles, integer n, integer m) {
1405 int bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1406 int stride = blockDim.x * gridDim.x;
1407 int offset = 0;
1408
1409 while ((bodyIndex + offset) < n) {
1410 tree->start[bodyIndex + offset] = bodyIndex + offset;
1411 offset += stride;
1412 }
1413}
1414
1415
1416__global__ void TreeNS::Kernel::calculateCentersOfMass(Tree *tree, Particles *particles, integer n, integer level) {
1417
1418 int bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1419 int stride = blockDim.x * gridDim.x;
1420
1421 int offset = n;
1422
1423 register int i, index;
1424 register real mass;
1425 register real x;
1426#if DIM > 1
1427 register real y;
1428#if DIM == 3
1429 register real z;
1430#endif
1431#endif
1432
1433 while ((bodyIndex + offset) < *tree->index) {
1434
1435 i = bodyIndex + offset;
1436
1437 if (particles->level[i] == level) {
1438
1439 //if (particles->level[bodyIndex + offset] == -1 || particles->level[bodyIndex + offset] > 21) {
1440 // printf("level[%i] = %i!!!\n", bodyIndex + offset, particles->level[bodyIndex + offset]);
1441 // //assert(0);
1442 //}
1443
1444 // reset entries
1445 mass = 0.; //particles->mass[i] = 0.;
1446 x = 0.; //particles->x[i] = 0.;
1447#if DIM > 1
1448 y = 0.; //particles->y[i] = 0.;
1449#if DIM == 3
1450 z = 0.; //particles->z[i] = 0.;
1451#endif
1452#endif
1453
1454 // loop over children and add contribution (*=position(child) * mass(child))
1455 #pragma unroll
1456 for (int child = 0; child < POW_DIM; ++child) {
1457 index = POW_DIM * i + child;
1458 if (tree->child[index] != -1) {
1459 x += particles->weightedEntry(tree->child[index], Entry::x);
1460#if DIM > 1
1461 y += particles->weightedEntry(tree->child[index], Entry::y);
1462#if DIM == 3
1463 z += particles->weightedEntry(tree->child[index], Entry::z);
1464#endif
1465#endif
1466 mass += particles->mass[tree->child[index]];
1467 }
1468 }
1469
1470 // finish center of mass calculation by dividing with mass
1471 if (mass > 0.) { //particles->mass[i]
1472 //particles->x[i] /= particles->mass[i];
1473 x /= mass;
1474#if DIM > 1
1475 //particles->y[i] /= particles->mass[i];
1476 y /= mass;
1477#if DIM == 3
1478 //particles->z[i] /= particles->mass[i];
1479 z /= mass;
1480#endif
1481#endif
1482 }
1483
1484 particles->mass[i] = mass;
1485 particles->x[i] = x;
1486#if DIM > 1
1487 particles->y[i] = y;
1488#if DIM == 3
1489 particles->z[i] = z;
1490#endif
1491#endif
1492
1493 }
1494 offset += stride;
1495 }
1496
1497}
1498
1499// TODO: not needed anymore and ATTENTION! requires preparation within buildTree (using atomicAdd)
1500// therefore moved to calculateCenterOfMass(), since atomicOperations for 64-bit values VERY expensive
1501__global__ void TreeNS::Kernel::centerOfMass(Tree *tree, Particles *particles, integer n) {
1502
1503 integer bodyIndex = threadIdx.x + blockIdx.x*blockDim.x;
1504 integer stride = blockDim.x*gridDim.x;
1505 integer offset = 0;
1506
1507 //note: most of it already done within buildTreeKernel
1508 bodyIndex += n;
1509
1510 while (bodyIndex + offset < *tree->index) {
1511
1512 if (particles->mass[bodyIndex + offset] != 0) {
1513 particles->x[bodyIndex + offset] /= particles->mass[bodyIndex + offset];
1514#if DIM > 1
1515 particles->y[bodyIndex + offset] /= particles->mass[bodyIndex + offset];
1516#if DIM == 3
1517 particles->z[bodyIndex + offset] /= particles->mass[bodyIndex + offset];
1518#endif
1519#endif
1520 }
1521
1522 offset += stride;
1523 }
1524}
1525
1526// TODO: not working properly right now (only working for small number of particles)
1527__global__ void TreeNS::Kernel::sort(Tree *tree, integer n, integer m) {
1528
1529 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1530 integer stride = blockDim.x * gridDim.x;
1531 integer offset = 0;
1532
1533 integer s = 0;
1534 if (threadIdx.x == 0) {
1535
1536 for (integer i=0; i<POW_DIM; i++){
1537 integer node = tree->child[i];
1538 if (node >= m) { //n // not a leaf node
1539 tree->start[node] = s;
1540 s += tree->count[node];
1541 }
1542 else if (node >= 0) { // leaf node
1543 tree->sorted[s] = node;
1544 s++;
1545 }
1546 //if (bodyIndex == 0) {
1547 // printf("i = %i, start[%i] = %i (m = %i)\n", i, node, tree->start[node], m);
1548 //}
1549 }
1550 }
1551
1552 //__threadfence();
1553 //__syncthreads();
1554
1555 integer cell = m + bodyIndex;
1556 integer ind = *tree->index;
1557 //integer ind = tree->toDeleteNode[1];
1558
1559 //int counter = 0;
1560 while ((cell + offset) < ind /*&& counter <= 500*/) {
1561
1562 //if ((cell + offset) < (m + POW_DIM)) {
1563 // for (int i=0; i<POW_DIM; i++) {
1564 // printf("cell + offset = %i, start = %i, child = %i, start = %i, stride = %i\n", cell + offset, tree->start[cell + offset],
1565 // tree->child[POW_DIM * (cell + offset) + i], tree->start[cell + offset], stride);
1566 // }
1567 //}
1568
1569 s = tree->start[cell + offset];
1570 //counter += 1;
1571
1572 //if (counter >= 500 /*&& s >= 0*/) {
1573 // for (int i=0; i<POW_DIM; i++) {
1574 // printf("counter: %i, cell + offset = %i, start = %i, child = %i, count = %i\n", counter, cell + offset, s,
1575 // tree->child[POW_DIM * (cell + offset) + i], tree->count[tree->child[POW_DIM * (cell + offset) + i]]);
1576 // }
1577 //}
1578
1579 if (s >= 0) {
1580
1581 for (integer i_child=0; i_child<POW_DIM; i_child++) {
1582 integer node = tree->child[POW_DIM*(cell+offset) + i_child];
1583 if (node >= m) { //m // not a leaf node
1584 tree->start[node] = s;
1585 s += tree->count[node];
1586 //if (tree->count[node] >= 0) {
1587 // s += tree->count[node];
1588 //}
1589 //else {
1590 // printf("+= %i\n", tree->count[node]);
1591 //}
1592 }
1593 else if (node >= 0) { // leaf node
1594 tree->sorted[s] = node;
1595 s++;
1596 }
1597 }
1598 //if (counter >= 0) {
1599 // offset -= counter * stride;
1600 // counter = 0;
1601 //}
1602 //else {
1603 // offset += stride;
1604 //}
1605 // //counter = 0;
1606 offset += stride;
1607 }
1608 //else {
1609 // printf("ATTENTION: s = %i for %i\n", s, cell + offset);
1610 // offset += stride;
1611 // counter++;
1612 // //break;
1613 //}
1614
1615 }
1616}
1617
1618__global__ void TreeNS::Kernel::getParticleKeys(Tree *tree, Particles *particles, keyType *keys, integer maxLevel,
1619 integer n, Curve::Type curveType) {
1620
1621 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1622 integer stride = blockDim.x * gridDim.x;
1623 integer offset = 0;
1624
1625 keyType particleKey;
1626
1627 while (bodyIndex + offset < n) {
1628
1629 particleKey = tree->getParticleKey(particles, bodyIndex + offset, maxLevel, curveType);
1630#if DEBUGGING
1631#if DIM == 3
1632 if (particleKey == 1UL) {
1633 printf("particleKey = %lu (%f, %f, %f)\n", particleKey, particles->x[bodyIndex + offset],
1634 particles->y[bodyIndex + offset], particles->z[bodyIndex + offset]);
1635 }
1636#endif
1637#endif
1638 keys[bodyIndex + offset] = particleKey;
1639
1640 offset += stride;
1641 }
1642
1643}
1644
1645__global__ void TreeNS::Kernel::globalCOM(Tree *tree, Particles *particles, real com[DIM]) {
1646
1647 real mass = 0;
1648 for (int i=0; i<DIM; i++) {
1649 com[i] = 0;
1650 }
1651 for (int i=0; i<POW_DIM; i++) {
1652 if (tree->child[i] != -1) {
1653 mass += particles->mass[tree->child[i]];
1654 com[0] += particles->weightedEntry(tree->child[i], Entry::x);
1655#if DIM > 1
1656 com[1] += particles->weightedEntry(tree->child[i], Entry::y);
1657#if DIM == 3
1658 com[2] += particles->weightedEntry(tree->child[i], Entry::z);
1659#endif
1660#endif
1661 }
1662 }
1663 if (mass > 0) {
1664 com[0] /= mass;
1665#if DIM > 1
1666 com[1] /= mass;
1667#if DIM == 3
1668 com[2] /= mass;
1669#endif
1670#endif
1671 }
1672
1673}
1674
1675namespace TreeNS {
1676
1677 namespace Kernel {
1678
1679 __global__ void set(Tree *tree, integer *count, integer *start, integer *child, integer *sorted,
1680 integer *index, integer *toDeleteLeaf, integer *toDeleteNode, real *minX, real *maxX) {
1681 tree->set(count, start, child, sorted, index, toDeleteLeaf, toDeleteNode, minX, maxX);
1682 }
1683
1684 __global__ void info(Tree *tree, Particles *particles, integer n, integer m) {
1685 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1686 integer stride = blockDim.x * gridDim.x;
1687 integer offset = 0;
1688
1689 int relevantChild = 0;
1690 int childIndex, temp;
1691
1692 while ((bodyIndex + offset) < POW_DIM) {
1693 childIndex = tree->child[bodyIndex + offset];
1694 temp = childIndex;
1695 while (childIndex != -1) {
1696 childIndex = tree->child[POW_DIM * childIndex + relevantChild];
1697
1698 if (childIndex != -1) {
1699 if (particles->level[temp] >= particles->level[childIndex]) {
1700 cudaAssert("level[%i]: %i vs. level[%i]: %i\n", temp, particles->level[temp], childIndex,
1701 particles->level[childIndex]);
1702 }
1703 }
1704
1705 temp = childIndex;
1706 }
1707
1708 offset += stride;
1709 }
1710
1711 //while ((bodyIndex + offset) < n) {
1712 // if (particles->level[bodyIndex + offset] < 0) {
1713 // printf("attention: level[%i] = %i\n", bodyIndex + offset,
1714 // particles->level[bodyIndex + offset]);
1715 // assert(0);
1716 // }
1717 // offset += stride;
1718 //}
1719 //offset = m;
1720 //while ((bodyIndex + offset) < *tree->index) {
1721 // if (particles->level[bodyIndex + offset] < 0 && particles->mass[bodyIndex + offset] > 1e-4) {
1722 //#if DIM == 3
1723 // printf("attention: level[%i] = %i (%e, %e, %e) %e\n", bodyIndex + offset,
1724 // particles->level[bodyIndex + offset],
1725 // particles->x[bodyIndex + offset],
1726 // particles->y[bodyIndex + offset],
1727 // particles->z[bodyIndex + offset],
1728 // particles->mass[bodyIndex + offset]);
1729 //#endif
1730 // assert(0);
1731 // }
1732 // offset += stride;
1733 //}
1734
1735 //offset = tree->toDeleteLeaf[0];
1736 //while ((bodyIndex + offset) < n && (bodyIndex + offset)) {
1737 // for (int i=0; i<POW_DIM; i++) {
1738 // if (tree->child[POW_DIM * (bodyIndex + offset) + i] != -1) {
1739 // printf("tree->child[POW_DIM * %i + %i] = %i\n", bodyIndex + offset, i, tree->child[POW_DIM * (bodyIndex + offset) + i]);
1740 // assert(0);
1741 // }
1742 // }
1743 // offset += stride;
1744 //}
1745 //offset = tree->toDeleteNode[0];
1746 //while ((bodyIndex + offset) < m && (bodyIndex + offset)) {
1747 // for (int i=0; i<POW_DIM; i++) {
1748 // if (tree->child[POW_DIM * (bodyIndex + offset) + i] != -1) {
1749 // printf("tree->child[POW_DIM * %i + %i] = %i\n", bodyIndex + offset, i, tree->child[POW_DIM * (bodyIndex + offset) + i]);
1750 // assert(0);
1751 // }
1752 // }
1753 // offset += stride;
1754 //}
1755
1756 //while (bodyIndex + offset < n) {
1757 // if ((bodyIndex + offset) % 10000 == 0) {
1758 // printf("tree info\n");
1759 // }
1760 // offset += stride;
1761 //}
1762
1763 //bodyIndex += n;
1764 //while (bodyIndex + offset < m) {
1765
1766 //printf("particles->mass[%i] = %f (%f, %f, %f) (%i, %i)\n", bodyIndex + offset,
1767 // particles->mass[bodyIndex + offset],
1768 // particles->x[bodyIndex + offset],
1769 // particles->y[bodyIndex + offset],
1770 // particles->z[bodyIndex + offset], n, m);
1771
1772 //printf("x[%i] = (%f, %f, %f) mass = %f\n", bodyIndex + offset, particles->x[bodyIndex + offset],
1773 // particles->y[bodyIndex + offset], particles->z[bodyIndex + offset],
1774 // particles->mass[bodyIndex + offset]);
1775 //#if DIM == 1
1776 //printf("(%f), \n", particles->x[bodyIndex + offset]);
1777 //#elif DIM == 2
1778 //printf("(%f, %f), \n", particles->x[bodyIndex + offset],
1779 // particles->y[bodyIndex + offset]);
1780 //#else
1781 //printf("(%f, %f, %f), \n", particles->x[bodyIndex + offset],
1782 // particles->y[bodyIndex + offset], particles->z[bodyIndex + offset]);
1783 //#endif
1784 //offset += stride;
1785 //}
1786 }
1787
1788 __global__ void info(Tree *tree, Particles *particles) {
1789
1790 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1791 integer stride = blockDim.x * gridDim.x;
1792 integer offset = 0;
1793
1794 while (bodyIndex + offset < POW_DIM) {
1795#if DIM == 3
1796 printf("child[POW_DIM * 0 + %i] = %i, x = (%f, %f, %f) m = %f\n", bodyIndex + offset,
1797 tree->child[bodyIndex + offset], particles->x[tree->child[bodyIndex + offset]],
1798 particles->y[tree->child[bodyIndex + offset]], particles->z[tree->child[bodyIndex + offset]],
1799 particles->mass[tree->child[bodyIndex + offset]]);
1800
1801 for (int i=0; i<POW_DIM; i++) {
1802 printf("child[POW_DIM * %i + %i] = %i, x = (%f, %f, %f) m = %f\n", tree->child[bodyIndex + offset], i,
1803 tree->child[POW_DIM * tree->child[bodyIndex + offset] + i],
1804 particles->x[tree->child[POW_DIM * tree->child[bodyIndex + offset] + i]],
1805 particles->y[tree->child[POW_DIM * tree->child[bodyIndex + offset] + i]],
1806 particles->z[tree->child[POW_DIM * tree->child[bodyIndex + offset] + i]],
1807 particles->mass[tree->child[POW_DIM * tree->child[bodyIndex + offset] + i]]);
1808 }
1809#endif
1810
1811 offset += stride;
1812 }
1813 }
1814
1815 __global__ void testTree(Tree *tree, Particles *particles, integer n, integer m) {
1816
1817 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1818 integer stride = blockDim.x * gridDim.x;
1819 integer offset = 0;
1820
1821 real mass;
1822 real masses[POW_DIM];
1823
1824 while (bodyIndex + offset < POW_DIM) {
1825
1826 mass = 0;
1827
1828 for (int i=0; i<POW_DIM; i++) {
1829 masses[i] = 0;
1830 if (tree->child[POW_DIM * tree->child[bodyIndex + offset] + i] != -1) {
1831 masses[i] = particles->mass[tree->child[POW_DIM * tree->child[bodyIndex + offset] + i]];
1832 mass += masses[i];
1833 }
1834 }
1835 if (mass != particles->mass[tree->child[bodyIndex + offset]]) {
1836 printf("testTree: index: %i mass %f vs %f (%f, %f, %f, %f, %f, %f, %f, %f)\n", bodyIndex + offset, mass, particles->mass[tree->child[bodyIndex + offset]],
1837 masses[0], masses[1], masses[2], masses[3], masses[4], masses[5], masses[6], masses[7]);
1838 }
1839
1840 offset += stride;
1841 }
1842
1843 //while (bodyIndex + offset < n) {
1844 // if (particles->x[bodyIndex + offset] == 0.f &&
1845 // particles->y[bodyIndex + offset] == 0.f &&
1846 // particles->z[bodyIndex + offset] == 0.f &&
1847 // particles->mass[bodyIndex + offset] == 0.f) {
1848 // printf("particle ZERO for index = %i: (%f, %f, %f) %f\n", bodyIndex + offset,
1849 // particles->x[bodyIndex + offset], particles->y[bodyIndex + offset],
1850 // particles->z[bodyIndex + offset], particles->mass[bodyIndex + offset]);
1851 // }
1852 //
1853 // offset += stride;
1854 //}
1855 //offset = m;
1856 //while (bodyIndex + offset < *tree->index) {
1857 // if (particles->x[bodyIndex + offset] == 0.f &&
1858 // particles->y[bodyIndex + offset] == 0.f &&
1859 // particles->z[bodyIndex + offset] == 0.f &&
1860 // particles->mass[bodyIndex + offset] == 0.f) {
1861 // printf("particle ZERO for index = %i: (%f, %f, %f) %f\n", bodyIndex + offset,
1862 // particles->x[bodyIndex + offset], particles->y[bodyIndex + offset],
1863 // particles->z[bodyIndex + offset], particles->mass[bodyIndex + offset]);
1864 // }
1865 // offset += stride;
1866 //}
1867 }
1868
1869 void Launch::set(Tree *tree, integer *count, integer *start, integer *child, integer *sorted,
1870 integer *index, integer *toDeleteLeaf, integer *toDeleteNode , real *minX, real *maxX) {
1871 ExecutionPolicy executionPolicy(1, 1);
1872 cuda::launch(false, executionPolicy, ::TreeNS::Kernel::set, tree, count, start, child, sorted,
1873 index, toDeleteLeaf, toDeleteNode, minX, maxX);
1874 }
1875
1876 real Launch::info(Tree *tree, Particles *particles, integer n, integer m) {
1877 ExecutionPolicy executionPolicy;
1878 return cuda::launch(true, executionPolicy, ::TreeNS::Kernel::info, tree, particles, n, m);
1879 }
1880
1881 real Launch::info(Tree *tree, Particles *particles) {
1882 ExecutionPolicy executionPolicy;
1883 return cuda::launch(true, executionPolicy, ::TreeNS::Kernel::info, tree, particles);
1884 }
1885
1886 real Launch::testTree(Tree *tree, Particles *particles, integer n, integer m) {
1887 ExecutionPolicy executionPolicy;
1888 return cuda::launch(true, executionPolicy, ::TreeNS::Kernel::testTree, tree, particles, n, m);
1889 }
1890
1891#if DIM > 1
1892
1893 __global__ void set(Tree *tree, integer *count, integer *start, integer *child, integer *sorted,
1894 integer *index, integer *toDeleteLeaf, integer *toDeleteNode, real *minX, real *maxX,
1895 real *minY, real *maxY) {
1896 tree->set(count, start, child, sorted, index, toDeleteLeaf, toDeleteNode, minX, maxX, minY, maxY);
1897 }
1898
1899 void Launch::set(Tree *tree, integer *count, integer *start, integer *child, integer *sorted,
1900 integer *index, integer *toDeleteLeaf, integer *toDeleteNode, real *minX, real *maxX,
1901 real *minY, real *maxY) {
1902 ExecutionPolicy executionPolicy(1, 1);
1903 cuda::launch(false, executionPolicy, ::TreeNS::Kernel::set, tree, count, start, child, sorted, index,
1904 toDeleteLeaf, toDeleteNode, minX, maxX, minY, maxY);
1905 }
1906
1907#if DIM == 3
1908
1909 __global__ void set(Tree *tree, integer *count, integer *start, integer *child, integer *sorted,
1910 integer *index, integer *toDeleteLeaf, integer *toDeleteNode, real *minX, real *maxX,
1911 real *minY, real *maxY, real *minZ, real *maxZ) {
1912 tree->set(count, start, child, sorted, index, toDeleteLeaf, toDeleteNode, minX, maxX, minY, maxY,
1913 minZ, maxZ);
1914 }
1915
1916 void Launch::set(Tree *tree, integer *count, integer *start, integer *child, integer *sorted,
1917 integer *index, integer *toDeleteLeaf, integer *toDeleteNode, real *minX, real *maxX,
1918 real *minY, real *maxY, real *minZ, real *maxZ) {
1919 ExecutionPolicy executionPolicy(1, 1);
1920 cuda::launch(false, executionPolicy, ::TreeNS::Kernel::set, tree, count, start, child, sorted, index,
1921 toDeleteLeaf, toDeleteNode, minX, maxX, minY, maxY, minZ, maxZ);
1922 }
1923
1924#endif
1925#endif
1926
1927 namespace Launch {
1928
1929 real sumParticles(Tree *tree) {
1930 ExecutionPolicy executionPolicy;
1931 return cuda::launch(true, executionPolicy, ::TreeNS::Kernel::sumParticles, tree);
1932 }
1933
1934 real buildTree(Tree *tree, Particles *particles, integer n, integer m, bool time) {
1935 ExecutionPolicy executionPolicy(24, 32);
1936 return cuda::launch(time, executionPolicy, ::TreeNS::Kernel::buildTree, tree, particles, n, m);
1937 }
1938
1939 real prepareSorting(Tree *tree, Particles *particles, integer n, integer m) {
1940 ExecutionPolicy executionPolicy;
1941 return cuda::launch(time, executionPolicy, ::TreeNS::Kernel::prepareSorting, tree, particles, n, m);
1942 }
1943
1944 real calculateCentersOfMass(Tree *tree, Particles *particles, integer n, integer level, bool time) {
1945 ExecutionPolicy executionPolicy;
1946 return cuda::launch(time, executionPolicy, ::TreeNS::Kernel::calculateCentersOfMass, tree, particles, n, level);
1947 }
1948
1949 real computeBoundingBox(Tree *tree, Particles *particles, integer *mutex, integer n, integer blockSize,
1950 bool time) {
1951 size_t sharedMemory = 2 * DIM * sizeof(real) * blockSize;
1952 ExecutionPolicy executionPolicy(4, 256, sharedMemory);
1953 return cuda::launch(time, executionPolicy, ::TreeNS::Kernel::computeBoundingBox, tree, particles, mutex,
1954 n, blockSize);
1955 }
1956
1957 real centerOfMass(Tree *tree, Particles *particles, integer n, bool time) {
1958 ExecutionPolicy executionPolicy;
1959 return cuda::launch(time, executionPolicy, ::TreeNS::Kernel::centerOfMass, tree, particles, n);
1960 }
1961
1962 real sort(Tree *tree, integer n, integer m, bool time) {
1963 ExecutionPolicy executionPolicy(1024, 1024);
1964 return cuda::launch(time, executionPolicy, ::TreeNS::Kernel::sort, tree, n, m);
1965 }
1966
1967 real getParticleKeys(Tree *tree, Particles *particles, keyType *keys, integer maxLevel, integer n,
1968 Curve::Type curveType, bool time) {
1969 ExecutionPolicy executionPolicy;
1970 return cuda::launch(time, executionPolicy, ::TreeNS::Kernel::getParticleKeys, tree, particles, keys,
1971 maxLevel, n, curveType);
1972 }
1973
1974 real globalCOM(Tree *tree, Particles *particles, real com[DIM]) {
1975 ExecutionPolicy executionPolicy(1, 1);
1976 return cuda::launch(true, executionPolicy, ::TreeNS::Kernel::globalCOM, tree, particles, com);
1977 }
1978 }
1979 }
1980}
1981
1982
1983#if UNIT_TESTING
1984namespace UnitTesting {
1985 namespace Kernel {
1986
1987 __global__ void test_localTree(Tree *tree, Particles *particles, int n, int m) {
1988
1989 int bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1990 int stride = blockDim.x * gridDim.x;
1991
1992 real childMasses;
1993 real positionX;
1994#if DIM > 1
1995 real positionY;
1996#if DIM == 3
1997 real positionZ;
1998#endif
1999#endif
2000 int childIndex;
2001 // check pseudo-particles
2002 int offset = n;
2003 while ((bodyIndex + offset) < m) {
2004 // check whether pseudo-particles are correctly calculated
2005 childMasses = 0.;
2006 positionX = 0.;
2007#if DIM > 1
2008 positionY = 0.;
2009#if DIM == 3
2010 positionZ = 0.;
2011#endif
2012#endif
2013 for (int child=0; child<POW_DIM; child++) {
2014 childIndex = tree->child[POW_DIM * (bodyIndex + offset) + child];
2015
2016 if (childIndex != -1) {
2017 childMasses += particles->mass[childIndex];
2018 positionX += particles->mass[childIndex] * particles->x[childIndex];
2019#if DIM > 1
2020 positionY += particles->mass[childIndex] * particles->y[childIndex];
2021#if DIM == 3
2022 positionZ += particles->mass[childIndex] * particles->z[childIndex];
2023#endif
2024#endif
2025 }
2026 }
2027 if (childMasses > 0.) {
2028 positionX /= childMasses;
2029#if DIM > 1
2030 positionY /= childMasses;
2031#if DIM == 3
2032 positionZ /= childMasses;
2033#endif
2034#endif
2035 }
2036 //if (particles->nodeType[bodyIndex + offset] == 1) { // <
2037 // printf("Masses [%i] ?: %e vs %e (type: %i)!\n", bodyIndex + offset, particles->mass[bodyIndex + offset], childMasses, particles->nodeType[bodyIndex + offset]);
2038 //}
2039 // now compare
2040 if (particles->mass[bodyIndex + offset] != childMasses) {
2041 if (particles->nodeType[bodyIndex + offset] != 2) { // >=
2042 printf("Masses are not correct [%i]: %e vs %e (type: %i)!\n", bodyIndex + offset, particles->mass[bodyIndex + offset], childMasses, particles->nodeType[bodyIndex + offset]);
2043 //for (int child=0; child<POW_DIM; child++) {
2044 // printf("[%i] Masses: index: %i\n", bodyIndex + offset, tree->child[POW_DIM * (bodyIndex + offset) + child]);
2045 //}
2046 }
2047 }
2048 if (cuda::math::abs(particles->x[bodyIndex + offset] - positionX) > 1e-3) {
2049 if (particles->nodeType[bodyIndex + offset] != 2) {
2050 printf("Masses... X position are not correct [%i]: %e vs %e (m = %e vs %e)!\n", bodyIndex + offset,
2051 particles->x[bodyIndex + offset], positionX, particles->mass[bodyIndex + offset], childMasses);
2052 for (int child=0; child<POW_DIM; child++) {
2053 printf("[%i] Masses: index: %i\n", bodyIndex + offset, tree->child[POW_DIM * (bodyIndex + offset) + child]);
2054 }
2055 }
2056 }
2057#if DIM > 1
2058 if (cuda::math::abs(particles->y[bodyIndex + offset] - positionY) > 1e-3) {
2059 if (particles->nodeType[bodyIndex + offset] != 2) {
2060 printf("Masses... Y position are not correct: %e vs %e (m = %e vs %e)!\n", particles->y[bodyIndex + offset], positionY, particles->mass[bodyIndex + offset], childMasses);
2061 }
2062 }
2063#if DIM == 3
2064 if (cuda::math::abs(particles->z[bodyIndex + offset] - positionZ) > 1e-3) {
2065 if (particles->nodeType[bodyIndex + offset] != 2) {
2066 printf("Masses... Z position are not correct: %e vs %e (m = %e vs %e)!\n", particles->z[bodyIndex + offset], positionZ, particles->mass[bodyIndex + offset], childMasses);
2067 }
2068 }
2069#endif
2070#endif
2071 offset += stride;
2072 }
2073 }
2074
2075 namespace Launch {
2076 real test_localTree(Tree *tree, Particles *particles, int n, int m) {
2077 ExecutionPolicy executionPolicy;
2078 return cuda::launch(true, executionPolicy, ::UnitTesting::Kernel::test_localTree, tree, particles, n, m);
2079 }
2080 }
2081 }
2082}
2083#endif
ExecutionPolicy
Execution policy/instruction for CUDA kernel execution.
Definition: cuda_launcher.cuh:33
Particles
Particle(s) class based on SoA (Structur of Arrays).
Definition: particles.cuh:50
Particles::weightedEntry
CUDA_CALLABLE_MEMBER real weightedEntry(integer index, Entry::Name entry)
Definition: particles.cu:339
Particles::x
real * x
(pointer to) x position (array)
Definition: particles.cuh:62
Particles::level
integer * level
(pointer to) level of the (pseudo-)particles
Definition: particles.cuh:106
Particles::nodeType
integer * nodeType
(pointer to) node type
Definition: particles.cuh:103
Particles::y
real * y
(pointer to) y position (array)
Definition: particles.cuh:70
Particles::mass
real * mass
(pointer to) mass (array)
Definition: particles.cuh:60
Particles::z
real * z
(pointer to) z position (array)
Definition: particles.cuh:78
Tree
Tree class.
Definition: tree.cuh:140
Tree::reset
CUDA_CALLABLE_MEMBER void reset(integer index, integer n)
Reset (specific) entries.
Definition: tree.cu:133
Tree::index
integer * index
index used for counting nodes
Definition: tree.cuh:154
Tree::minZ
real * minZ
bounding box minimal z
Definition: tree.cuh:172
Tree::child
integer * child
children/child nodes or leaves (beneath)
Definition: tree.cuh:150
Tree::getParticleKey
CUDA_CALLABLE_MEMBER keyType getParticleKey(Particles *particles, integer index, integer maxLevel, Curve::Type curveType=Curve::lebesgue)
Get SFC key of a particle.
Definition: tree.cu:157
Tree::maxX
real * maxX
bounding box maximal x
Definition: tree.cuh:164
Tree::sorted
integer * sorted
sorted (indices) for better cache performance
Definition: tree.cuh:152
Tree::count
integer * count
accumulated nodes/leaves beneath
Definition: tree.cuh:146
Tree::sumParticles
CUDA_CALLABLE_MEMBER integer sumParticles()
Sum particles in tree.
Definition: tree.cu:288
Tree::toDeleteNode
integer * toDeleteNode
buffer for remembering old indices for rebuilding tree
Definition: tree.cuh:159
Tree::~Tree
CUDA_CALLABLE_MEMBER ~Tree()
Destructor.
Definition: tree.cu:298
Tree::Tree
CUDA_CALLABLE_MEMBER Tree()
Default constructor.
Definition: tree.cu:55
Tree::minX
real * minX
bounding box minimal x
Definition: tree.cuh:162
Tree::start
integer * start
TODO: describe start.
Definition: tree.cuh:148
Tree::toDeleteLeaf
integer * toDeleteLeaf
buffer for remembering old indices for rebuilding tree
Definition: tree.cuh:157
Tree::getTreeLevel
CUDA_CALLABLE_MEMBER integer getTreeLevel(Particles *particles, integer index, integer maxLevel, Curve::Type curveType=Curve::lebesgue)
Get tree level for a (specific) particle.
Definition: tree.cu:242
Tree::set
CUDA_CALLABLE_MEMBER void set(integer *count, integer *start, integer *child, integer *sorted, integer *index, integer *toDeleteLeaf, integer *toDeleteNode, real *minX, real *maxX)
Setter, passing pointer to member variables.
Definition: tree.cu:65
Tree::maxY
real * maxY
bounding box maximal y
Definition: tree.cuh:169
Tree::minY
real * minY
bounding box minimal y
Definition: tree.cuh:167
Tree::maxZ
real * maxZ
bounding box maximal z
Definition: tree.cuh:174
cudaAssert
#define cudaAssert(...)
Definition: cuda_utilities.cuh:50
CUDA_CALLABLE_MEMBER
#define CUDA_CALLABLE_MEMBER
Definition: cuda_utilities.cuh:30
Kernel
Definition: device_rhs.cuh:7
KeyNS::HilbertTable
const unsigned char HilbertTable[12][8]
Table needed to convert from Lebesgue to Hilbert keys.
Definition: tree.cuh:83
KeyNS::lebesgue2hilbert
CUDA_CALLABLE_MEMBER keyType lebesgue2hilbert(keyType lebesgue, integer maxLevel)
Convert a Lebesgue key to a Hilbert key.
Definition: tree.cu:4
MaterialNS::Kernel::info
__global__ void info(Material *material)
Debug kernel giving information about material(s).
Definition: material.cu:24
ProfilerIds::Time::tree
const char *const tree
Definition: h5profiler.h:57
SPH::Kernel::calculateCentersOfMass
__global__ void calculateCentersOfMass(Tree *tree, Particles *particles, integer level)
Definition: sph.cu:2551
SubDomainKeyTreeNS::Kernel::getParticleKeys
__global__ void getParticleKeys(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, keyType *keys, integer maxLevel, integer n, Curve::Type curveType=Curve::lebesgue)
Kernel to get all particle keys (and additional information for debugging purposes).
Definition: subdomain.cu:849
TreeNS::Kernel::Launch::calculateCentersOfMass
real calculateCentersOfMass(Tree *tree, Particles *particles, integer n, integer level, bool time=false)
Definition: tree.cu:1944
TreeNS::Kernel::Launch::prepareSorting
real prepareSorting(Tree *tree, Particles *particles, integer n, integer m)
Definition: tree.cu:1939
TreeNS::Kernel::Launch::sort
real sort(Tree *tree, integer n, integer m, bool time=false)
Wrapper for TreeNS::Kernel::sort()
Definition: tree.cu:1962
TreeNS::Kernel::Launch::centerOfMass
real centerOfMass(Tree *tree, Particles *particles, integer n, bool time=false)
Wrapper for TreeNS::Kernel::centerOfMass()
Definition: tree.cu:1957
TreeNS::Kernel::Launch::sumParticles
real sumParticles(Tree *tree)
Wrapper for TreeNS::Kernel::sumParticles()
Definition: tree.cu:1929
TreeNS::Kernel::Launch::globalCOM
real globalCOM(Tree *tree, Particles *particles, real com[DIM])
Wrapper for TreeNS::Kernel::globalCOM()
Definition: tree.cu:1974
TreeNS::Kernel::Launch::testTree
real testTree(Tree *tree, Particles *particles, integer n, integer m)
Definition: tree.cu:1886
TreeNS::Kernel::Launch::computeBoundingBox
real computeBoundingBox(Tree *tree, Particles *particles, integer *mutex, integer n, integer blockSize, bool time=false)
Wrapper for TreeNS::Kernel::computeBoundingBox()
Definition: tree.cu:1949
TreeNS::Kernel::Launch::set
void set(Tree *tree, integer *count, integer *start, integer *child, integer *sorted, integer *index, integer *toDeleteLeaf, integer *toDeleteNode, real *minX, real *maxX)
Wrapper for TreeNS::Kernel::set()
Definition: tree.cu:1869
TreeNS::Kernel::Launch::getParticleKeys
real getParticleKeys(Tree *tree, Particles *particles, keyType *keys, integer maxLevel, integer n, Curve::Type curveType=Curve::lebesgue, bool time=false)
Wrapper for TreeNS::Kernel::getParticleKeys()
Definition: tree.cu:1967
TreeNS::Kernel::Launch::info
real info(Tree *tree, Particles *particles, integer n, integer m)
Wrapper for TreeNS::Kernel::Launch::info()
Definition: tree.cu:1876
TreeNS::Kernel::Launch::buildTree
real buildTree(Tree *tree, Particles *particles, integer n, integer m, bool time=false)
Wrapper for TreeNS::Kernel::buildTree()
Definition: tree.cu:1934
TreeNS::Kernel::prepareSorting
__global__ void prepareSorting(Tree *tree, Particles *particles, integer n, integer m)
Definition: tree.cu:1404
TreeNS::Kernel::set
__global__ void set(Tree *tree, integer *count, integer *start, integer *child, integer *sorted, integer *index, integer *toDeleteLeaf, integer *toDeleteNode, real *minX, real *maxX)
Kernel call to setter.
Definition: tree.cu:1679
TreeNS::Kernel::testTree
__global__ void testTree(Tree *tree, Particles *particles, integer n, integer m)
Definition: tree.cu:1815
TreeNS::Kernel::info
__global__ void info(Tree *tree, Particles *particles, integer n, integer m)
Info Kernel for tree class (for debugging purposes)
Definition: tree.cu:1684
TreeNS::Kernel::getParticleKeys
__global__ void getParticleKeys(Tree *tree, Particles *particles, keyType *keys, integer maxLevel, integer n, Curve::Type curveType=Curve::lebesgue)
Kernel to get all particle's keys.
Definition: tree.cu:1618
TreeNS::Kernel::computeBoundingBox
__global__ void computeBoundingBox(Tree *tree, Particles *particles, integer *mutex, integer n, integer blockSize)
Kernel to compute the bounding box/borders of the tree or rather the particles within the tree.
Definition: tree.cu:302
TreeNS::Kernel::calculateCentersOfMass
__global__ void calculateCentersOfMass(Tree *tree, Particles *particles, integer n, integer level)
Compute center of masses (level wise).
Definition: tree.cu:1416
TreeNS::Kernel::sumParticles
__global__ void sumParticles(Tree *tree)
Kernel call to sum particles within tree.
Definition: tree.cu:440
TreeNS::Kernel::sort
__global__ void sort(Tree *tree, integer n, integer m)
Kernel to sort tree/child indices to optimize cache efficiency.
Definition: tree.cu:1527
TreeNS::Kernel::buildTree
__global__ void buildTree(Tree *tree, Particles *particles, integer n, integer m)
Kernel to construct the tree using the particles within particles
Definition: tree.cu:768
TreeNS::Kernel::globalCOM
__global__ void globalCOM(Tree *tree, Particles *particles, real com[DIM])
Compute center of mass for all particles.
Definition: tree.cu:1645
TreeNS::Kernel::centerOfMass
__global__ void centerOfMass(Tree *tree, Particles *particles, integer n)
Definition: tree.cu:1501
TreeNS
Definition: tree.cuh:354
cuda::math::min
__device__ real min(real a, real b)
Minimum value out of two floating point values.
Definition: cuda_utilities.cu:414
cuda::math::abs
__device__ real abs(real a)
Absolute value of a floating point value.
Definition: cuda_utilities.cu:448
cuda::math::max
__device__ real max(real a, real b)
Maximum value out of two floating point values.
Definition: cuda_utilities.cu:431
cuda::set
void set(T *d_var, T val, std::size_t count=1)
Set device memory to a specific value.
Definition: cuda_runtime.h:56
cuda::launch
real launch(bool timeKernel, const ExecutionPolicy &policy, void(*f)(Arguments...), Arguments... args)
CUDA execution wrapper function.
Definition: cuda_launcher.cuh:114
real
double real
Definition: parameter.h:15
keyType
unsigned long keyType
Definition: parameter.h:18
integer
int integer
Definition: parameter.h:17
MAX_LEVEL
#define MAX_LEVEL
Definition: parameter.h:25
POW_DIM
#define POW_DIM
Definition: parameter.h:40
DIM
#define DIM
Dimension of the problem.
Definition: parameter.h:38
Curve::Type
Type
Definition: parameter.h:206
Curve::lebesgue
@ lebesgue
Definition: parameter.h:207
Curve::hilbert
@ hilbert
Definition: parameter.h:207
Entry::y
@ y
Definition: parameter.h:258
Entry::x
@ x
Definition: parameter.h:256
Entry::z
@ z
Definition: parameter.h:260

milupHPC - src/subdomain_key_tree/tree.cu Source File
Generated on Wed Aug 31 2022 12:16:53 by Doxygen 1.9.3