milupHPC documentation
  • src
  • gravity
gravity.cu
Go to the documentation of this file.
1#include "../../include/gravity/gravity.cuh"
2#include "../../include/cuda_utils/cuda_launcher.cuh"
3
4namespace Gravity {
5
6 namespace Kernel {
7
8 __global__ void collectSendIndices(Tree *tree, Particles *particles, integer *sendIndices,
9 integer *particles2Send, integer *pseudoParticles2Send,
10 integer *pseudoParticlesLevel,
11 integer *particlesCount, integer *pseudoParticlesCount,
12 integer n, integer length, Curve::Type curveType) {
13
14 integer bodyIndex = threadIdx.x + blockIdx.x*blockDim.x;
15 integer stride = blockDim.x*gridDim.x;
16 integer offset = 0;
17
18 integer particleInsertIndex;
19 integer pseudoParticleInsertIndex;
20
21 while ((bodyIndex + offset) < length) {
22
23 if (sendIndices[bodyIndex + offset] == 1) { // TODO: >= 1 or == 1 (was == 1)
24
25 // it is a particle
26 if (bodyIndex + offset < n) {
27 particleInsertIndex = atomicAdd(particlesCount, 1);
28 particles2Send[particleInsertIndex] = bodyIndex + offset;
29 }
30 // it is a pseudo-particle
31 else {
32 pseudoParticleInsertIndex = atomicAdd(pseudoParticlesCount, 1);
33 pseudoParticles2Send[pseudoParticleInsertIndex] = bodyIndex + offset;
34 pseudoParticlesLevel[pseudoParticleInsertIndex] = particles->level[bodyIndex + offset];
35 //printf("pseudo-particle level to be sent: %i (%i)\n", particles->level[bodyIndex + offset],
36 // bodyIndex + offset);
37 //pseudoParticlesLevel[pseudoParticleInsertIndex] = tree->getTreeLevel(particles,
38 // bodyIndex + offset,
39 // MAX_LEVEL, curveType);
40
41 // debug
42 //if (pseudoParticlesLevel[pseudoParticleInsertIndex] == -1) {
43 // printf("level = -1 within collectSendIndices for index: %i\n", bodyIndex + offset);
44 //}
45 // end: debug
46 }
47 }
48 __threadfence();
49 offset += stride;
50 }
51 }
52
53 __global__ void collectSendIndices_test4(Tree *tree, Particles *particles, integer *sendIndices,
54 integer *particles2Send, integer *pseudoParticles2Send,
55 integer *pseudoParticlesLevel,
56 integer *particlesCount, integer *pseudoParticlesCount,
57 integer numParticlesLocal, integer numParticles,
58 integer treeIndex, int currentProc, Curve::Type curveType) {
59
60 integer bodyIndex = threadIdx.x + blockIdx.x*blockDim.x;
61 integer stride = blockDim.x*gridDim.x;
62 integer offset = 0;
63
64 integer particleInsertIndex;
65 integer pseudoParticleInsertIndex;
66
67 // it is a particle
68 while ((bodyIndex + offset) < numParticlesLocal) {
69 if ((sendIndices[bodyIndex + offset] >> currentProc) & 1) { // TODO: >= 1 or == 1 (was == 1)
70 if (bodyIndex + offset < numParticlesLocal) {
71 particleInsertIndex = atomicAdd(particlesCount, 1);
72 particles2Send[particleInsertIndex] = bodyIndex + offset;
73 }
74 }
75 __threadfence();
76 offset += stride;
77 }
78
79 // it is a pseudo-particle
80 offset = numParticles;
81 while ((bodyIndex + offset) < treeIndex) {
82 if ((sendIndices[bodyIndex + offset] >> currentProc) & 1) {
83 pseudoParticleInsertIndex = atomicAdd(pseudoParticlesCount, 1);
84 pseudoParticles2Send[pseudoParticleInsertIndex] = bodyIndex + offset;
85 pseudoParticlesLevel[pseudoParticleInsertIndex] = particles->level[bodyIndex + offset];
86 //printf("pseudo-particle level to be sent: %i (%i)\n", particles->level[bodyIndex + offset],
87 // bodyIndex + offset);
88 //pseudoParticlesLevel[pseudoParticleInsertIndex] = tree->getTreeLevel(particles,
89 // bodyIndex + offset,
90 // MAX_LEVEL, curveType);
91
92 // debug
93 //if (pseudoParticlesLevel[pseudoParticleInsertIndex] == -1) {
94 // printf("level = -1 within collectSendIndices for index: %i\n", bodyIndex + offset);
95 //}
96 // end: debug
97 }
98 __threadfence();
99 offset += stride;
100 }
101 }
102
103 __global__ void testSendIndices(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles,
104 integer *sendIndices, integer *markedSendIndices,
105 integer *levels, Curve::Type curveType, integer length) {
106
107 integer bodyIndex = threadIdx.x + blockIdx.x*blockDim.x;
108 integer stride = blockDim.x*gridDim.x;
109 integer offset = 0;
110
111 integer key;
112 integer temp;
113 integer childIndex;
114 integer childPath;
115
116 bool available = false;
117
118 real min_x, max_x;
119#if DIM > 1
120 real min_y, max_y;
121#if DIM == 3
122 real min_z, max_z;
123#endif
124#endif
125
126 //while ((bodyIndex + offset) < length) {
127 // printf("x[%i] = (%f, %f, %f) %f\n", sendIndices[bodyIndex + offset], particles->x[sendIndices[bodyIndex + offset]],
128 // particles->y[sendIndices[bodyIndex + offset]], particles->z[sendIndices[bodyIndex + offset]],
129 // particles->mass[sendIndices[bodyIndex + offset]]);
130 // offset += stride;
131 //}
132
133 //while ((bodyIndex + offset) < length) {
134 // if (particles->x[sendIndices[bodyIndex + offset]] == 0.f &&
135 // particles->y[sendIndices[bodyIndex + offset]] == 0.f &&
136 // particles->z[sendIndices[bodyIndex + offset]] == 0.f &&
137 // particles->mass[sendIndices[bodyIndex + offset]]) {
138 //
139 // }
140 // offset += stride;
141 //}
142
143 // ///////////////////////////////////////////////////////////////////////////////////
144
145 //if (bodyIndex == 0) {
146 // for (int i = 0; i<10; i++) {
147 // printf("sendIndices[%i] = %i (length = %i)\n", length - 1 + i, sendIndices[length -1 + i], length);
148 // }
149 //}
150
151 //if (bodyIndex == 0) {
152 // integer i=0;
153 // for (int i = 0; i<30000; i++) {
154 // if (markedSendIndices[100000 + i] == 1) {
155 // printf("[rank %i] markedSendIndices[%i] = %i!\n", subDomainKeyTree->rank, 100000 + i, markedSendIndices[100000 + i]);
156 // break;
157 // }
158 // }
159 //}
160
161 while ((bodyIndex + offset) < length) {
162
163 //printf("index = %i sendIndex = %i level = %i\n", bodyIndex + offset, sendIndices[bodyIndex + offset],
164 // levels[bodyIndex + offset]);
165
166 min_x = *tree->minX;
167 max_x = *tree->maxX;
168#if DIM > 1
169 min_y = *tree->minY;
170 max_y = *tree->maxY;
171#if DIM == 3
172 min_z = *tree->minZ;
173 max_z = *tree->maxZ;
174#endif
175#endif
176
177 available = false;
178
179 childIndex = 0;
180 if (levels[bodyIndex + offset] > 3) {
181 //key = tree->getParticleKey(particles, bodyIndex + offset + tree->toDeleteNode[0], MAX_LEVEL,
182 // curveType);
183
184 //printf("level = %i\n", levels[bodyIndex + offset]);
185
186 childIndex = 0;
187
188 for (int j = 0; j < levels[bodyIndex + offset] - 1; j++) {
189
190 temp = childIndex;
191
192 childPath = 0;
193 if (particles->x[sendIndices[bodyIndex + offset]] < 0.5 * (min_x + max_x)) {
194 childPath += 1;
195 max_x = 0.5 * (min_x + max_x);
196 } else {
197 min_x = 0.5 * (min_x + max_x);
198 }
199#if DIM > 1
200 if (particles->y[sendIndices[bodyIndex + offset]] < 0.5 * (min_y + max_y)) {
201 childPath += 2;
202 max_y = 0.5 * (min_y + max_y);
203 } else {
204 min_y = 0.5 * (min_y + max_y);
205 }
206#if DIM == 3
207 if (particles->z[sendIndices[bodyIndex + offset]] < 0.5 * (min_z + max_z)) {
208 childPath += 4;
209 max_z = 0.5 * (min_z + max_z);
210 } else {
211 min_z = 0.5 * (min_z + max_z);
212 }
213#endif
214#endif
215 //printf("childIndex = %i\n", childIndex);
216 childIndex = tree->child[POW_DIM * temp + childPath];
217 if (bodyIndex + offset == 0) {
218 printf("tree->child[POW_DIM * %i + %i] = %i (%i)\n", temp, childPath, tree->child[POW_DIM * temp + childPath], sendIndices[bodyIndex + offset]);
219 }
220 }
221
222
223 for (int i = 0; i < length; i++) {
224 if (temp == sendIndices[i]) {
225 available = true;
226 break;
227 }
228 }
229
230 if (!available) {
231 //integer a = -1;
232 //markedSendIndices[childIndex] = a;
233
234 cudaTerminate("[rank %i] %i (relevant son: %i) NOT Available sendIndices[%i] = %i, [%i] = %i)!\n",
235 subDomainKeyTree->rank, temp, childIndex, childIndex,
236 markedSendIndices[childIndex], temp, markedSendIndices[temp])
237 }
238
239 //if (childIndex != sendIndices[bodyIndex + offset]) {
240 //printf("ATTENTION childIndex != bodyIndex level = %i (%i != %i) (%f, %f, %f)!\n", levels[bodyIndex + offset], childIndex, sendIndices[bodyIndex + offset],
241 // particles->x[sendIndices[bodyIndex + offset]], particles->y[sendIndices[bodyIndex + offset]],
242 // particles->z[sendIndices[bodyIndex + offset]]);
243 //} else {
244 //printf("--\n");
245 //}
246 }
247
248 offset += stride;
249 }
250 }
251
252 __global__ void computeForces_v1(Tree *tree, Particles *particles, real radius, integer n, integer m,
253 SubDomainKeyTree *subDomainKeyTree, real theta, real smoothing,
254 bool potentialEnergy) {
255
256 register int i, ii;
257 int child, nodeIndex, childNumber, depth;
258
259 real px, ax, dx, f, distance;
260#if DIM > 1
261 real py, ay, dy;
262#if DIM == 3
263 real pz, az, dz;
264#endif
265#endif
266 real sml;
267 real thetasq = theta*theta;
268
269 int currentNodeIndex[MAX_DEPTH];
270 int currentChildNumber[MAX_DEPTH];
271
272 __shared__ volatile real cellSize[MAX_DEPTH];
273
274 if (threadIdx.x == 0) {
275 cellSize[0] = 4.0 * radius * radius; //4.0 * radius * radius;
276#pragma unroll
277 for (i = 1; i < MAX_DEPTH; i++) {
278 cellSize[i] = cellSize[i - 1] * 0.25;
279 }
280 }
281
282 __syncthreads();
283
284 for (ii = threadIdx.x + blockIdx.x * blockDim.x; ii < n; ii += blockDim.x * gridDim.x) {
285
286 i = tree->sorted[ii]; //i = ii;
287
288 px = particles->x[i];
289#if DIM > 1
290 py = particles->y[i];
291#if DIM == 3
292 pz = particles->z[i];
293#endif
294#endif
295 // TODO: resetting not really necessary?!
296 //particles->ax[i] = 0.0;
297 particles->g_ax[i] = 0.0;
298#if DIM > 1
299 //particles->ay[i] = 0.0;
300 particles->g_ay[i] = 0.0;
301#if DIM == 3
302 //particles->az[i] = 0.0;
303 particles->g_az[i] = 0.0;
304#endif
305#endif
306 ax = 0.0;
307#if DIM > 1
308 ay = 0.0;
309#if DIM == 3
310 az = 0.0;
311#endif
312#endif
313
314 // start at root
315 depth = 1;
316 currentNodeIndex[depth] = 0;
317 currentChildNumber[depth] = 0;
318
319 do {
320 childNumber = currentChildNumber[depth];
321 nodeIndex = currentNodeIndex[depth];
322
323 while(childNumber < POW_DIM) {
324
325 do {
326 child = tree->child[POW_DIM * nodeIndex + childNumber];
327 childNumber++;
328 } while(child == -1 && childNumber < POW_DIM);
329
330 if (child != -1 && child != i) { // dont do self-gravity with yourself!
331
332 dx = particles->x[child] - px;
333 distance = dx*dx + smoothing;
334#if DIM > 1
335 dy = particles->y[child] - py;
336 distance += dy*dy;
337#endif
338#if DIM == 3
339 dz = particles->z[child] - pz;
340 distance += dz*dz;
341#endif
342 // if child is leaf or far away
343 if (child < n || distance * thetasq > cellSize[depth]) {
344
345 distance = cuda::math::sqrt(distance);
346#if SI_UNITS
347 f = Constants::G * particles->mass[child] / (distance * distance * distance);
348#else
349 f = particles->mass[child] / (distance * distance * distance);
350#endif
351
352 ax += f*dx;
353#if DIM > 1
354 ay += f*dy;
355#if DIM == 3
356 az += f*dz;
357#endif
358#endif
359 // gravitational potential energy
360 if (potentialEnergy) {
361#if SI_UNITS
362 particles->u[i] -= 0.5 * (Constants::G * particles->mass[child] * particles->mass[i]) / distance;
363#else
364 particles->u[i] -= 0.5 * (particles->mass[child] * particles->mass[i]) / distance;
365#endif
366 }
367 } else {
368 // put child on stack
369 currentChildNumber[depth] = childNumber;
370 currentNodeIndex[depth] = nodeIndex;
371 //if (particles->nodeType[child] != 3) {
372 depth++;
373 //}
374 if (depth == MAX_DEPTH) {
375 cudaTerminate("depth = %i >= MAX_DEPTH = %i\n", depth, MAX_DEPTH);
376 }
377 childNumber = 0;
378 nodeIndex = child;
379 }
380 }
381 }
382 depth--;
383 } while(depth > 0);
384
385 //particles->ax[i] = ax;
386 particles->g_ax[i] = ax;
387#if DIM > 1
388 //particles->ay[i] = ay;
389 particles->g_ay[i] = ay;
390#if DIM == 3
391 //particles->az[i] = az;
392 particles->g_az[i] = az;
393#endif
394#endif
395 }
396 }
397
398 __global__ void computeForces_v1_1(Tree *tree, Particles *particles, real radius, integer n, integer m,
399 SubDomainKeyTree *subDomainKeyTree, real theta, real smoothing,
400 bool potentialEnergy) {
401
402 integer i, ii, child, nodeIndex, childNumber, depth;
403
404 real px, ax, dx, f, distance;
405#if DIM > 1
406 real py, ay, dy;
407#if DIM == 3
408 real pz, az, dz;
409#endif
410#endif
411
412 real sml;
413 real thetasq = theta*theta;
414
415 integer currentNodeIndex[MAX_DEPTH];
416 integer currentChildNumber[MAX_DEPTH];
417
418 __shared__ volatile real cellSize[MAX_DEPTH];
419
420 if (threadIdx.x == 0) {
421 cellSize[0] = 4.0 * radius * radius;
422#pragma unroll
423 for (i = 1; i < MAX_DEPTH; i++) {
424 cellSize[i] = cellSize[i - 1] * 0.25;
425 }
426 }
427
428 __syncthreads();
429
430 for (i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += blockDim.x * gridDim.x) {
431
432 px = particles->x[i];
433#if DIM > 1
434 py = particles->y[i];
435#if DIM == 3
436 pz = particles->z[i];
437#endif
438#endif
439 //particles->ax[i] = 0.0;
440 particles->g_ax[i] = 0.0;
441#if DIM > 1
442 //particles->ay[i] = 0.0;
443 particles->g_ay[i] = 0.0;
444#if DIM == 3
445 //particles->az[i] = 0.0;
446 particles->g_az[i] = 0.0;
447#endif
448#endif
449 ax = 0.0;
450#if DIM > 1
451 ay = 0.0;
452#if DIM == 3
453 az = 0.0;
454#endif
455#endif
456
457 // start at root
458 depth = 1;
459 currentNodeIndex[depth] = 0;
460 currentChildNumber[depth] = 0;
461
462 do {
463 childNumber = currentChildNumber[depth];
464 nodeIndex = currentNodeIndex[depth];
465
466 while(childNumber < POW_DIM) {
467 do {
468 child = tree->child[POW_DIM * nodeIndex + childNumber]; //childList[childListIndex(nodeIndex, childNumber)];
469 childNumber++;
470 } while(child == -1 && childNumber < POW_DIM);
471 if (child != -1 && child != i) { // dont do self-gravity with yourself!
472 dx = particles->x[child] - px;
473 distance = dx*dx + smoothing; //150329404.287723; //(0.0317 * 0.0317); //0.025;
474#if DIM > 1
475 dy = particles->y[child] - py;
476 distance += dy*dy;
477#endif
478#if DIM == 3
479 dz = particles->z[child] - pz;
480 distance += dz*dz;
481#endif
482 // if child is leaf or far away
483 if (child < n || distance * thetasq > cellSize[depth]) {
484 //if (particles->nodeType[child] == 3 || particles->nodeType[child] == -10) {
485 // printf("Taking node type %i into account...\n", particles->nodeType[child]);
486 // if (particles->nodeType[child] == 3) {
487 // for (int i_test=0; i_test<POW_DIM; ++i_test) {
488 // printf("%i node type %i: (%e, %e, %e | %e), child %i: %i (%e, %e, %e | %e)\n",
489 // child, particles->nodeType[child], particles->x[child], particles->y[child], particles->z[child],
490 // particles->mass[child], i_test,
491 // tree->child[POW_DIM * child + i_test],
492 // particles->x[tree->child[POW_DIM * child + i_test]],
493 // particles->y[tree->child[POW_DIM * child + i_test]],
494 // particles->z[tree->child[POW_DIM * child + i_test]],
495 // particles->mass[tree->child[POW_DIM * child + i_test]]);
496 // }
497 // }
498 //}
499 distance = sqrt(distance);
500#if SI_UNITS
501 f = Constants::G * particles->mass[child] / (distance * distance * distance);
502#else
503 f = particles->mass[child] / (distance * distance * distance);
504#endif
505
506 ax += f*dx;
507#if DIM > 1
508 ay += f*dy;
509#if DIM == 3
510 az += f*dz;
511#endif
512#endif
513 // gravitational potential energy
514 if (potentialEnergy) {
515#if SI_UNITS
516 particles->u[i] -= 0.5 * (Constants::G * particles->mass[child] * particles->mass[i])/distance;
517#else
518 particles->u[i] -= 0.5 * (particles->mass[child] * particles->mass[i])/distance;
519#endif
520 }
521 } else {
522 // put child on stack
523 currentChildNumber[depth] = childNumber;
524 currentNodeIndex[depth] = nodeIndex;
525 depth++;
526 if (depth == MAX_DEPTH) {
527 cudaTerminate("depth = %i >= MAX_DEPTH = %i\n", depth, MAX_DEPTH);
528 }
529 childNumber = 0;
530 nodeIndex = child;
531 }
532 }
533 }
534 depth--;
535 } while(depth > 0);
536
537 //particles->ax[i] = ax;
538 particles->g_ax[i] = ax;
539#if DIM > 1
540 //particles->ay[i] = ay;
541 particles->g_ay[i] = ay;
542#if DIM == 3
543 //particles->az[i] = az;
544 particles->g_az[i] = az;
545#endif
546#endif
547 }
548 }
549
550 __global__ void computeForces_v1_2(Tree *tree, Particles *particles, real radius, integer n, integer m,
551 SubDomainKeyTree *subDomainKeyTree, real theta, real smoothing,
552 bool potentialEnergy) {
553
554 register int i, ii;
555 int child, nodeIndex, childNumber, depth;
556
557 real px, ax, dx, f, distance;
558#if DIM > 1
559 real py, ay, dy;
560#if DIM == 3
561 real pz, az, dz;
562#endif
563#endif
564 real sml;
565 real thetasq = theta*theta;
566
567 __shared__ int currentNodeIndex[MAX_DEPTH];
568 __shared__ int currentChildNumber[MAX_DEPTH];
569
570 __shared__ volatile real cellSize[MAX_DEPTH];
571
572 if (threadIdx.x == 0) {
573 cellSize[0] = 4.0 * radius * radius;
574 #pragma unroll
575 for (i = 1; i < MAX_DEPTH; i++) {
576 cellSize[i] = cellSize[i - 1] * 0.25;
577 }
578 }
579
580 __syncthreads();
581
582 for (ii = threadIdx.x + blockIdx.x * blockDim.x; ii < n; ii += blockDim.x * gridDim.x) {
583
584 i = tree->sorted[ii]; //i = ii;
585
586 px = particles->x[i];
587#if DIM > 1
588 py = particles->y[i];
589#if DIM == 3
590 pz = particles->z[i];
591#endif
592#endif
593 //particles->ax[i] = 0.0;
594 particles->g_ax[i] = 0.0;
595#if DIM > 1
596 //particles->ay[i] = 0.0;
597 particles->g_ay[i] = 0.0;
598#if DIM == 3
599 //particles->az[i] = 0.0;
600 particles->g_az[i] = 0.0;
601#endif
602#endif
603 ax = 0.0;
604#if DIM > 1
605 ay = 0.0;
606#if DIM == 3
607 az = 0.0;
608#endif
609#endif
610
611 // start at root
612 depth = 1;
613 currentNodeIndex[depth] = 0;
614 currentChildNumber[depth] = 0;
615
616 do {
617 childNumber = currentChildNumber[depth];
618 nodeIndex = currentNodeIndex[depth];
619
620 while(childNumber < POW_DIM) {
621 do {
622 child = tree->child[POW_DIM * nodeIndex + childNumber];
623 childNumber++;
624 } while(child == -1 && childNumber < POW_DIM);
625 if (child != -1 && child != i) { // dont do self-gravity with yourself!
626 dx = particles->x[child] - px;
627 distance = dx*dx + smoothing; //150329404.287723; //(0.0317 * 0.0317); //0.025;
628#if DIM > 1
629 dy = particles->y[child] - py;
630 distance += dy*dy;
631#endif
632#if DIM == 3
633 dz = particles->z[child] - pz;
634 distance += dz*dz;
635#endif
636 // if child is leaf or far away
637 if (child < n || distance * thetasq > cellSize[depth]) {
638 distance = cuda::math::sqrt(distance);
639#if SI_UNITS
640 f = Constants::G * particles->mass[child] / (distance * distance * distance);
641#else
642 f = particles->mass[child] / (distance * distance * distance);
643#endif
644
645 ax += f*dx;
646#if DIM > 1
647 ay += f*dy;
648#if DIM == 3
649 az += f*dz;
650#endif
651#endif
652 // gravitational potential energy
653 if (potentialEnergy) {
654#if SI_UNITS
655 particles->u[i] -= 0.5 * (Constants::G * particles->mass[child] * particles->mass[i])/distance;
656#else
657 particles->u[i] -= 0.5 * (particles->mass[child] * particles->mass[i])/distance;
658#endif
659 }
660 } else {
661 // put child on stack
662 currentChildNumber[depth] = childNumber;
663 currentNodeIndex[depth] = nodeIndex;
664 depth++;
665 if (depth == MAX_DEPTH) {
666 cudaTerminate("depth = %i >= MAX_DEPTH = %i\n", depth, MAX_DEPTH);
667 }
668 childNumber = 0;
669 nodeIndex = child;
670 }
671 }
672 }
673 depth--;
674 } while(depth > 0);
675
676 //particles->ax[i] = ax;
677 particles->g_ax[i] = ax;
678#if DIM > 1
679 //particles->ay[i] = ay;
680 particles->g_ay[i] = ay;
681#if DIM == 3
682 //particles->az[i] = az;
683 particles->g_az[i] = az;
684#endif
685#endif
686 }
687 }
688
689 // see: https://iss.oden.utexas.edu/Publications/Papers/burtscher11.pdf
690 /*__global__ void test() {
691 // precompute and cache info
692 // determine first thread in each warp
693 for (//sorted body indexes assigned to me) {
694 // cache body data
695 // initialize iteration stack
696 depth = 0;
697 while (depth >= 0) {
698 while (//there are more nodes to visit) {
699 if (//I’m the first thread in the warp) {
700 // move on to next node
701 // read node data and put in shared memory
702 }
703 threadfence block();
704 if (//node is not null) {
705 // get node data from shared memory
706 // compute distance to node
707 if ((//node is a body) || all(//distance >= cutoff)) {
708 // compute interaction force contribution
709 } else {
710 depth++; // descend to next tree level
711 if (//I’m the first thread in the warp) {
712 // push node’s children onto iteration stack
713 }
714 threadfence block();
715 }
716 } else {
717 depth = max(0, depth-1); // early out because remaining nodes are also null
718 }
719 }
720 depth−−;
721 }
722 // update body data
723 }
724 }*/
725
726 __global__ void computeForces_v2(Tree *tree, Particles *particles, real radius, integer n, integer m,
727 integer blockSize, integer warp, integer stackSize,
728 SubDomainKeyTree *subDomainKeyTree, real theta,
729 real smoothing, bool potentialEnergy) {
730
731 integer bodyIndex = threadIdx.x + blockIdx.x*blockDim.x;
732 integer stride = blockDim.x*gridDim.x;
733 integer offset = 0;
734
735 register int sortedIndex;
736
737 //__shared__ real depth[stackSize * blockSize/warp];
738 //__shared__ integer stack[stackSize * blockSize/warp];
739 extern __shared__ real buffer[];
740
741 real* depth = (real*)buffer;
742 integer* stack = (integer*)&depth[stackSize * blockSize/warp];
743
744 real pos_x;
745#if DIM > 1
746 real pos_y;
747#if DIM == 3
748 real pos_z;
749#endif
750#endif
751
752 real acc_x;
753#if DIM > 1
754 real acc_y;
755#if DIM == 3
756 real acc_z;
757#endif
758#endif
759
760 // in case that one of the first children are a leaf
761 int jj = -1;
762 #pragma unroll
763 for (integer i=0; i<POW_DIM; i++) {
764 if (tree->child[i] != -1) {
765 jj++;
766 }
767 }
768
769 int counter = threadIdx.x % warp;
770 int stackStartIndex = stackSize*(threadIdx.x / warp);
771
772 while ((bodyIndex + offset) < n) {
773
774 //sortedIndex = bodyIndex + offset;
775 sortedIndex = tree->sorted[bodyIndex + offset];
776
777 pos_x = particles->x[sortedIndex];
778#if DIM > 1
779 pos_y = particles->y[sortedIndex];
780#if DIM == 3
781 pos_z = particles->z[sortedIndex];
782#endif
783#endif
784
785 acc_x = 0.0;
786#if DIM > 1
787 acc_y = 0.0;
788#if DIM == 3
789 acc_z = 0.0;
790#endif
791#endif
792
793 // initialize stack
794 integer top = jj + stackStartIndex;
795
796 if (counter == 0) {
797
798 int temp = 0;
799
800 #pragma unroll
801 for (int i=0; i<POW_DIM; i++) {
802 if (tree->child[i] != -1) {
803 stack[stackStartIndex + temp] = tree->child[i];
804 depth[stackStartIndex + temp] = radius*radius/theta;
805 temp++;
806 }
807 }
808 }
809 __syncthreads();
810
811 // while stack is not empty / more nodes to visit
812 while (top >= stackStartIndex) {
813
814 integer node = stack[top];
815
816 real dp = 0.25 * depth[top]; //powf(0.5, DIM) * depth[top]; //0.25*depth[top]; // float dp = depth[top];
817
818 for (integer i=0; i<POW_DIM; i++) {
819
820 integer ch = tree->child[POW_DIM*node + i];
821
822 //__threadfence();
823
824 if (ch >= 0) {
825
826 real dx = particles->x[ch] - pos_x;
827#if DIM > 1
828 real dy = particles->y[ch] - pos_y;
829#if DIM == 3
830 real dz = particles->z[ch] - pos_z;
831#endif
832#endif
833
834 real r = dx*dx + smoothing; // SMOOTHING
835#if DIM > 1
836 r += dy*dy;
837#if DIM == 3
838 r += dz*dz;
839#endif
840#endif
841
842 //if (ch < n /*is leaf node*/ || !__any_sync(activeMask, dp > r)) {
843 if (ch < m /*is leaf node*/ || __all_sync(__activemask(), dp <= r)) {
844
845 // calculate interaction force contribution
846 if (r > 0.f) { //NEW //TODO: how to avoid r = 0?
847 r = cuda::math::rsqrt(r);
848 }
849
850#if SI_UNITS
851 real f = Constants::G * particles->mass[ch] * r * r * r;
852#else
853 real f = particles->mass[ch] * r * r * r;
854#endif
855
856 acc_x += f*dx;
857#if DIM > 1
858 acc_y += f*dy;
859#if DIM == 3
860 acc_z += f*dz;
861#endif
862#endif
863 if (potentialEnergy) {
864#if SI_UNITS
865 particles->u[bodyIndex + offset] -= 0.5 * (Constants::G * particles->mass[ch] *
866 particles->mass[bodyIndex + offset])/cuda::math::sqrt(r);
867#else
868 particles->u[bodyIndex + offset] -= 0.5 * (particles->mass[ch] *
869 particles->mass[bodyIndex + offset])/cuda::math::sqrt(r);
870#endif
871 }
872 }
873 else {
874 // if first thread in warp: push node's children onto iteration stack
875 if (counter == 0) {
876 stack[top] = ch;
877 depth[top] = dp; // depth[top] = 0.25*dp;
878 }
879 top++; // descend to next tree level
880 __threadfence_block();
881 }
882 }
883 // this is not working
884 //else {
885 // top = cuda::math::max(stackStartIndex, top-1);
886 //}
887 }
888 top--;
889 }
890 // update body data
891 particles->g_ax[sortedIndex] = acc_x;
892#if DIM > 1
893 particles->g_ay[sortedIndex] = acc_y;
894#if DIM == 3
895 particles->g_az[sortedIndex] = acc_z;
896#endif
897#endif
898
899 offset += stride;
900 __syncthreads();
901 }
902
903 }
904
905 __global__ void computeForces_v2_1(Tree *tree, Particles *particles, integer n, integer m, integer blockSize,
906 integer warp, integer stackSize, SubDomainKeyTree *subDomainKeyTree,
907 real theta, real smoothing, bool potentialEnergy) {
908
909 integer bodyIndex = threadIdx.x + blockIdx.x*blockDim.x;
910 integer stride = blockDim.x*gridDim.x;
911 integer offset = 0;
912
913 //__shared__ real depth[stackSize * blockSize/warp];
914 //__shared__ integer stack[stackSize * blockSize/warp];
915 extern __shared__ real buffer[];
916
917 real* depth = (real*)buffer;
918 integer* stack = (integer*)&depth[stackSize * blockSize/warp];
919
920 real x_radius = 0.5*(*tree->maxX - (*tree->minX));
921#if DIM > 1
922 real y_radius = 0.5*(*tree->maxY - (*tree->minY));
923#if DIM == 3
924 real z_radius = 0.5*(*tree->maxZ - (*tree->minZ));
925#endif
926#endif
927
928#if DIM == 1
929 real radius = x_radius;
930#elif DIM == 2
931 real radius = cuda::math::max(x_radius, y_radius);
932#else
933 real radius_max = cuda::math::max(x_radius, y_radius);
934 real radius = cuda::math::max(radius_max, z_radius);
935#endif
936
937 // in case that one of the first children are a leaf
938 integer jj = -1;
939 for (integer i=0; i<POW_DIM; i++) {
940 if (tree->child[i] != -1) {
941 jj++;
942 }
943 }
944
945 integer counter = threadIdx.x % warp;
946 integer stackStartIndex = stackSize*(threadIdx.x / warp);
947
948 while ((bodyIndex + offset) < n) {
949
950 real pos_x = particles->x[bodyIndex + offset];
951#if DIM > 1
952 real pos_y = particles->y[bodyIndex + offset];
953#if DIM == 3
954 real pos_z = particles->z[bodyIndex + offset];
955#endif
956#endif
957
958 real acc_x = 0.0;
959#if DIM > 1
960 real acc_y = 0.0;
961#if DIM == 3
962 real acc_z = 0.0;
963#endif
964#endif
965
966 // initialize stack
967 integer top = jj + stackStartIndex;
968
969 if (counter == 0) {
970
971 integer temp = 0;
972
973 for (int i=0; i<POW_DIM; i++) {
974 if (tree->child[i] != -1) {
975 stack[stackStartIndex + temp] = tree->child[i];
976 depth[stackStartIndex + temp] = radius*radius/theta;
977 temp++;
978 }
979 }
980 }
981 __syncthreads();
982
983 // while stack is not empty / more nodes to visit
984 while (top >= stackStartIndex) {
985
986 integer node = stack[top];
987
988 real dp = 0.5 * depth[top]; //powf(0.5, DIM) * depth[top]; //0.25*depth[top]; // float dp = depth[top];
989
990 for (integer i=0; i<POW_DIM; i++) {
991
992 integer ch = tree->child[POW_DIM*node + i];
993
994 //__threadfence();
995
996 if (ch >= 0) {
997
998 real dx = particles->x[ch] - pos_x;
999#if DIM > 1
1000 real dy = particles->y[ch] - pos_y;
1001#if DIM == 3
1002 real dz = particles->z[ch] - pos_z;
1003#endif
1004#endif
1005
1006 real r = dx*dx + smoothing; // SMOOTHING
1007#if DIM > 1
1008 r += dy*dy;
1009#if DIM == 3
1010 r += dz*dz;
1011#endif
1012#endif
1013
1014 //if (ch < n /*is leaf node*/ || !__any_sync(activeMask, dp > r)) {
1015 if (ch < m /*is leaf node*/ || __all_sync(__activemask(), dp <= r)) {
1016
1017 // calculate interaction force contribution
1018 if (r > 0.f) { //NEW //TODO: how to avoid r = 0?
1019 r = cuda::math::rsqrt(r);
1020 }
1021
1022#if SI_UNITS
1023 real f = Constants::G * particles->mass[ch] * r * r * r;
1024#else
1025 real f = particles->mass[ch] * r * r * r;
1026#endif
1027
1028 acc_x += f*dx;
1029#if DIM > 1
1030 acc_y += f*dy;
1031#if DIM == 3
1032 acc_z += f*dz;
1033#endif
1034#endif
1035 if (potentialEnergy) {
1036#if SI_UNITS
1037 particles->u[bodyIndex + offset] -= 0.5 * (Constants::G * particles->mass[ch] *
1038 particles->mass[bodyIndex + offset])/cuda::math::sqrt(r);
1039#else
1040 particles->u[bodyIndex + offset] -= 0.5 * (particles->mass[ch] *
1041 particles->mass[bodyIndex + offset])/cuda::math::sqrt(r);
1042#endif
1043 }
1044 }
1045 else {
1046 // if first thread in warp: push node's children onto iteration stack
1047 if (counter == 0) {
1048 stack[top] = ch;
1049 depth[top] = dp; // depth[top] = 0.25*dp;
1050 }
1051 top++; // descend to next tree level
1052 //__threadfence();
1053 }
1054 }
1055 //else {
1056 // top = max(stackStartIndex, top-1);
1057 //}
1058 }
1059 top--;
1060 }
1061 // update body data
1062 particles->g_ax[bodyIndex + offset] = acc_x;
1063#if DIM > 1
1064 particles->g_ay[bodyIndex + offset] = acc_y;
1065#if DIM == 3
1066 particles->g_az[bodyIndex + offset] = acc_z;
1067#endif
1068#endif
1069 offset += stride;
1070
1071 __syncthreads();
1072 }
1073
1074 }
1075
1076 /*
1077 __global__ void computeForces_v3(Tree *tree, Particles *particles, real radius, integer n, integer m,
1078 integer blockSize, integer warp, integer stackSize,
1079 SubDomainKeyTree *subDomainKeyTree, real theta,
1080 real smoothing, bool potentialEnergy) {
1081
1082 register int i, ii, depth;
1083
1084 real x, ax, dx;
1085#if DIM > 1
1086 real y, ay, dy;
1087#if DIM == 3
1088 real z, az, dz;
1089#endif
1090#endif
1091
1092 int child;
1093 int currentNodeIndex;
1094 int currentChildNumber;
1095 int nodeIndex[MAX_DEPTH];
1096 int childNumber[MAX_DEPTH];
1097
1098
1099 real f, distance;
1100 real thetaSquared = theta * theta;
1101 real cellSize = 4 * radius * radius;
1102
1103 for (ii = threadIdx.x + blockIdx.x * blockDim.x; ii < n; ii += blockDim.x * gridDim.x) {
1104
1105 i = tree->sorted[ii];
1106
1107 x = particles->x[i];
1108#if DIM > 1
1109 y = particles->y[i];
1110#if DIM == 3
1111 z = particles->z[i];
1112#endif
1113#endif
1114
1115 ax = 0.;
1116#if DIM > 1
1117 ay = 0.;
1118#if DIM == 3
1119 az = 0.;
1120#endif
1121#endif
1122
1123 depth = 0;
1124 nodeIndex[depth] = 0;
1125 childNumber[depth] = 0;
1126
1127 do {
1128
1129 currentChildNumber = childNumber[depth];
1130 currentNodeIndex = nodeIndex[depth];
1131
1132 while (currentChildNumber < POW_DIM) {
1133 do {
1134 child = tree->child[POW_DIM * currentNodeIndex + currentChildNumber];
1135 currentChildNumber++;
1136 } while (child == -1 && currentChildNumber < POW_DIM);
1137
1138 if (child != -1 && child != i) {
1139
1140 dx = particles->x[child] - x;
1141#if DIM > 1
1142 dy = particles->y[child] - y;
1143#if DIM == 3
1144 dz = particles->z[child] - z;
1145#endif
1146#endif
1147
1148#if DIM == 1
1149 distance = dx*dx + smoothing;
1150#elif DIM == 2
1151 distance = dx*dx + dy*dy + smoothing;
1152#else
1153 distance = dx*dx + dy*dy + dz*dz + smoothing;
1154#endif
1155
1156 if (child < n || distance * thetaSquared > (pow(0.25, (real)(depth + 1)) * cellSize)) {
1157 distance = cuda::math::sqrt(distance);
1158
1159#if SI_UNITS
1160 f = Constants::G * particles->mass[child] / (distance * distance * distance);
1161#else
1162 f = particles->mass[child] / (distance * distance * distance);
1163#endif
1164
1165 ax += f*dx;
1166#if DIM > 1
1167 ay += f*dy;
1168#if DIM == 3
1169 az += f*dz;
1170#endif
1171#endif
1172
1173 } else {
1174 childNumber[depth] = currentChildNumber;
1175 nodeIndex[depth] = currentNodeIndex;
1176 depth++;
1177 currentChildNumber = 0;
1178 currentNodeIndex = child;
1179 }
1180 }
1181 }
1182 depth--;
1183
1184 } while (depth >= 0);
1185
1186 //particles->ax[i] = ax;
1187 particles->g_ax[i] = ax;
1188#if DIM > 1
1189 //particles->ay[i] = ay;
1190 particles->g_ay[i] = ay;
1191#if DIM == 3
1192 //particles->az[i] = az;
1193 particles->g_az[i] = az;
1194#endif
1195#endif
1196 }
1197
1198 }
1199 */
1200
1201 /*__global__ void computeForces_v3(Tree *tree, Particles *particles, real radius, integer numParticles, integer m,
1202 integer blockSize, integer warp, integer stackSize,
1203 SubDomainKeyTree *subDomainKeyTree, real theta,
1204 real smoothing, bool potentialEnergy) {
1205
1206 int i, j, k, n, depth, base, sbase, diff, pd, nd;
1207 float x, y, z, ax, ay, az, dx, dy, dz, tmp;
1208
1209 extern __shared__ real buffer[];
1210
1211 int* pos = (int*)buffer;
1212 int *node = (int*)&pos[stackSize * blockSize/warp];
1213 real* dq = (real*)&node[stackSize * blockSize/warp];
1214 //__shared__ volatile int pos[stackSize * blockSize/warp], node[stackSize * blockSize/warp];
1215 //__shared__ float dq[stackSize * blockSize/warp];
1216
1217 if (threadIdx.x == 0) {
1218 tmp = radius * 2;
1219 // precompute values that depend only on tree level
1220 dq[0] = tmp * tmp * 4.0;
1221 for (i = 1; i < stackSize; i++) {
1222 dq[i] = dq[i - 1] * 0.25;
1223 //dq[i - 1] += smoothing;
1224 }
1225 //dq[i - 1] += smoothing;
1226 }
1227 __syncthreads();
1228
1229 // figure out first thread in each warp (lane 0)
1230 base = threadIdx.x / warp;
1231 sbase = base * warp;
1232 j = base * stackSize;
1233
1234 diff = threadIdx.x - sbase;
1235 // make multiple copies to avoid index calculations later
1236 if (diff < stackSize) {
1237 dq[diff+j] = dq[diff];
1238 }
1239 __syncthreads();
1240
1241 // iterate over all bodies assigned to thread
1242 for (k = threadIdx.x + blockIdx.x * blockDim.x; k < numParticles; k += blockDim.x * gridDim.x) {
1243
1244 i = tree->sorted[k]; // get permuted/sorted index
1245
1246 x = particles->x[i];
1247#if DIM > 1
1248 y = particles->y[i];
1249#if DIM == 3
1250 z = particles->z[i];
1251#endif
1252#endif
1253
1254 ax = 0.;
1255 ay = 0.;
1256 az = 0.;
1257
1258 // initialize iteration stack, i.e., push root node onto stack
1259 depth = j;
1260 if (sbase == threadIdx.x) {
1261 pos[j] = 0;
1262 node[j] = m * POW_DIM; // nnodes ??
1263 }
1264
1265 do {
1266 // stack is not empty
1267 pd = pos[depth];
1268 nd = node[depth];
1269 while (pd < POW_DIM) {
1270 // node on top of stack has more children to process
1271 n = tree->child[nd + pd]; // load child pointer
1272 pd++;
1273
1274 if (n >= 0) {
1275
1276 dx = x - particles->x[n];
1277 dy = y - particles->y[n];
1278 dz = z - particles->z[n];
1279 tmp = dx*dx + (dy*dy + (dz*dz + smoothing)); // compute distance squared (plus softening)
1280 if ((n < numParticles) || __all_sync(__activemask(), tmp >= dq[depth])) { // check if all threads agree that cell is far enough away (or is a body)
1281 tmp = cuda::math::rsqrt(tmp); // compute distance
1282 tmp = particles->mass[n] * tmp * tmp * tmp;
1283 ax += dx * tmp;
1284 ay += dy * tmp;
1285 az += dz * tmp;
1286 } else {
1287 // push cell onto stack
1288 if (sbase == threadIdx.x) {
1289 pos[depth] = pd;
1290 node[depth] = nd;
1291 }
1292 depth++;
1293 pd = 0;
1294 nd = n * POW_DIM;
1295 }
1296 } else {
1297 pd = POW_DIM; // early out because all remaining children are also zero
1298 }
1299 }
1300 depth--; // done with this level
1301 } while (depth >= j);
1302
1303 //float4 acc = accVeld[i];
1304 //if (stepd > 0) {
1305 // // update velocity
1306 // float2 v = veld[i];
1307 // v.x += (ax - acc.x) * dthfd;
1308 // v.y += (ay - acc.y) * dthfd;
1309 // acc.w += (az - acc.z) * dthfd;
1310 // veld[i] = v;
1311 //}
1312
1313 // save computed acceleration
1314
1315 particles->g_ax[i] = ax;
1316#if DIM > 1
1317 particles->g_ay[i] = ay;
1318#if DIM == 3
1319 particles->g_az[i] = az;
1320#endif
1321#endif
1322 }
1323
1324 }*/
1325
1326 __global__ void preSymbolicForce(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles,
1327 DomainList *domainList, integer *sendIndices, real diam, real theta_,
1328 integer n, integer m, integer relevantIndex, integer level,
1329 Curve::Type curveType) {
1330
1331
1332 int bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1333 int stride = blockDim.x * gridDim.x;
1334 int offset = n; // start with numParticles
1335
1336 int childIndex;
1337
1338 while ((bodyIndex + offset) < *tree->index) {
1339
1340 if (particles->level[bodyIndex + offset] <= 10) {
1341 //sendIndices[bodyIndex + offset] = 2;
1342 for (int i = 0; i < POW_DIM; i++) {
1343 childIndex = tree->child[POW_DIM * (bodyIndex + offset) + i];
1344 if (childIndex != -1 && /* not domain list node*/particles->nodeType[childIndex] == 0) {
1345 sendIndices[childIndex] = 2;
1346 }
1347 }
1348 }
1349 offset += stride;
1350 }
1351
1352 }
1353
1354 __global__ void resetSendIndices(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles,
1355 DomainList *domainList, integer *sendIndices, real diam, real theta_,
1356 integer n, integer m, integer relevantIndex, integer level,
1357 Curve::Type curveType) {
1358
1359
1360 int bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1361 int stride = blockDim.x * gridDim.x;
1362 int offset = 0; // start with numParticles
1363
1364 while ((bodyIndex + offset) < *tree->index) {
1365
1366 if (sendIndices[bodyIndex + offset] != 2) {
1367 sendIndices[bodyIndex + offset] = -1;
1368 }
1369 offset += stride;
1370 }
1371
1372 }
1373
1374 __global__ void intermediateSymbolicForce(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles,
1375 DomainList *domainList, integer *sendIndices, real diam, real theta_,
1376 integer n, integer m, integer relevantIndex, integer level,
1377 Curve::Type curveType) {
1378
1379 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1380 integer stride = blockDim.x * gridDim.x;
1381 integer offset = 0;
1382
1383 while ((bodyIndex + offset) < *tree->index) {
1384 if (sendIndices[bodyIndex + offset] == 2) {
1385 sendIndices[bodyIndex + offset] = 0;
1386 }
1387 if (sendIndices[bodyIndex + offset] == 3) {
1388 sendIndices[bodyIndex + offset] = 1;
1389 }
1390
1391 offset += stride;
1392 }
1393 }
1394
1395 __global__ void symbolicForce(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles,
1396 DomainList *domainList, integer *sendIndices, real diam, real theta_,
1397 integer n, integer m, integer relevantIndex, integer level,
1398 Curve::Type curveType) {
1399
1400 int bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1401 int stride = blockDim.x * gridDim.x;
1402 int offset = 0;
1403
1404 int particleLevel;
1405 int domainListLevel;
1406 int currentDomainListIndex;
1407 int currentParticleIndex;
1408 int childIndex;
1409
1410 int childPath;
1411 int tempChildIndex;
1412
1413 bool isDomainListNode;
1414 bool insert;
1415
1416 real min_x, max_x;
1417 real dx;
1418#if DIM > 1
1419 real min_y, max_y;
1420 real dy;
1421#if DIM == 3
1422 real min_z, max_z;
1423 real dz;
1424#endif
1425#endif
1426 real r;
1427
1428 // IDEA: sendIndices = [-1, -1, -1, ..., -1, -1]
1429 // mark to be tested indices with 2's: e.g.: sendIndices = [-1, 2, -1, ..., 2, -1]
1430 // the 2's are converted within a separate kernel to zeros (which will be tested within this kernel)
1431 // separate kernel necessary to avoid race conditions
1432 // mark to be sent indices/particles with 3's: e.g.: sendIndices = [-1, 0, 3, ..., 3, 3]
1433 // the 3's are converted within a separate kernel to ones
1434
1435 if (level == 0) { // mark first level children as starting point
1436 while ((bodyIndex + offset) < POW_DIM) {
1437 //if ((bodyIndex + offset) == 0) {
1438 // printf("symbolicForce: [rank %i] relevantDomainListIndices[%i] = %i (%f, %f, %f)\n",
1439 // subDomainKeyTree->rank,
1440 // relevantIndex, domainList->relevantDomainListIndices[relevantIndex],
1441 // particles->x[domainList->relevantDomainListIndices[relevantIndex]],
1442 // particles->y[domainList->relevantDomainListIndices[relevantIndex]],
1443 // particles->z[domainList->relevantDomainListIndices[relevantIndex]]);
1444 //}
1445 if (tree->child[bodyIndex + offset] != -1) {
1446 sendIndices[tree->child[bodyIndex + offset]] = 0;
1447 }
1448 offset += stride;
1449 }
1450 }
1451 else {
1452
1453 while ((bodyIndex + offset) < *tree->index) {
1454
1455 //if (bodyIndex + offset == 0) {
1456 // printf("[rank %i] relevantIndex = %i domainListIndex = %i (%f, %f, %f) %f\n", subDomainKeyTree->rank,
1457 // relevantIndex, domainList->relevantDomainListIndices[relevantIndex],
1458 // particles->x[domainList->relevantDomainListIndices[relevantIndex]],
1459 // particles->y[domainList->relevantDomainListIndices[relevantIndex]],
1460 // particles->z[domainList->relevantDomainListIndices[relevantIndex]],
1461 // particles->mass[domainList->relevantDomainListIndices[relevantIndex]]);
1462 //}
1463
1464 currentParticleIndex = bodyIndex + offset;
1465
1466 if ((sendIndices[currentParticleIndex] == 0 || sendIndices[currentParticleIndex] == 3) && (currentParticleIndex < n || currentParticleIndex >= m )) {
1467
1468 insert = true;
1469 isDomainListNode = false;
1470
1471 if (sendIndices[currentParticleIndex] == 0) {
1472
1473 // check whether to be inserted index corresponds to a domain list
1474 //if (insert) {
1475 //for (int i_domain = 0; i_domain < *domainList->domainListIndex; i_domain++) {
1476 // if ((bodyIndex + offset) == domainList->domainListIndices[i_domain]) {
1477 // insert = false;
1478 // isDomainListNode = true;
1479 // break;
1480 // }
1481 //}
1482 if (particles->nodeType[bodyIndex + offset] >= 1) {
1483 insert = false;
1484 isDomainListNode = true;
1485 }
1486 //}
1487 // TODO: this is probably not necessary, since only domain list indices can correspond to another process
1488 if (!isDomainListNode) {
1489 if (subDomainKeyTree->key2proc(
1490 tree->getParticleKey(particles, currentParticleIndex, MAX_LEVEL, curveType)) !=
1491 subDomainKeyTree->rank) {
1492 insert = false;
1493 //printf("Happening?\n");
1494 }
1495 }
1496
1497 if (insert) {
1498 sendIndices[currentParticleIndex] = 3;
1499 } else {
1500 sendIndices[currentParticleIndex] = -1;
1501 }
1502 }
1503
1504 // get the particle's level
1505 //particleLevel /*int tempParticleLevel*/ = tree->getTreeLevel(particles, currentParticleIndex, MAX_LEVEL, curveType);
1506 particleLevel = particles->level[currentParticleIndex];
1507 //if (tempParticleLevel != particleLevel) {
1508 // printf("%i vs %i\n", tempParticleLevel, particleLevel);
1509 //}
1510#if DEBUGGING
1511#if DIM == 3
1512 if (particleLevel < 0) {
1513 printf("WTF particleLevel = %i for %i (%e, %e, %e) (numParticlesLocal = %i, index = %i)\n",
1514 particleLevel, currentParticleIndex, particles->x[currentParticleIndex],
1515 particles->y[currentParticleIndex], particles->z[currentParticleIndex],
1516 n, *tree->index);
1517 }
1518#endif
1519#endif
1520
1521 // get the domain list node's level
1522 //domainListLevel = tree->getTreeLevel(particles,
1523 // domainList->relevantDomainListIndices[relevantIndex],
1524 // MAX_LEVEL, curveType);
1525 domainListLevel = domainList->relevantDomainListLevels[relevantIndex];
1526 currentDomainListIndex = domainList->relevantDomainListIndices[relevantIndex];
1527 //printf("domainListLevel = %i\n", domainListLevel);
1528 if (domainListLevel == -1) {
1529 cudaAssert("symbolicForce(): domainListLevel == -1 for (relevant) index: %i\n",
1530 relevantIndex);
1531 }
1532
1533 /*min_x = *tree->minX;
1534 max_x = *tree->maxX;
1535#if DIM > 1
1536 min_y = *tree->minY;
1537 max_y = *tree->maxY;
1538#if DIM == 3
1539 min_z = *tree->minZ;
1540 max_z = *tree->maxZ;
1541#endif
1542#endif
1543
1544 // determine domain list node's bounding box (in order to determine the distance)
1545 //if (domainListLevel != 1) {
1546 // printf("domainListLevel = %i\n", domainListLevel);
1547 // assert(0);
1548 //}
1549 for (int j = 0; j < domainListLevel; j++) {
1550
1551 childPath = 0;
1552 if (particles->x[currentDomainListIndex] < 0.5 * (min_x + max_x)) {
1553 childPath += 1;
1554 max_x = 0.5 * (min_x + max_x);
1555 } else {
1556 min_x = 0.5 * (min_x + max_x);
1557 }
1558#if DIM > 1
1559 if (particles->y[currentDomainListIndex] < 0.5 * (min_y + max_y)) {
1560 childPath += 2;
1561 max_y = 0.5 * (min_y + max_y);
1562 } else {
1563 min_y = 0.5 * (min_y + max_y);
1564 }
1565#if DIM == 3
1566 if (particles->z[currentDomainListIndex] < 0.5 * (min_z + max_z)) {
1567 childPath += 4;
1568 max_z = 0.5 * (min_z + max_z);
1569 } else {
1570 min_z = 0.5 * (min_z + max_z);
1571 }
1572#endif
1573#endif
1574 }*/
1575
1576 //printf("borders: %e vs %e, %e vs %e\n", min_x, domainList->borders[domainList->relevantDomainListOriginalIndex[relevantIndex] * 2 * DIM], max_x, domainList->borders[domainList->relevantDomainListOriginalIndex[relevantIndex] * 2 * DIM + 1]);
1577
1578 min_x = domainList->borders[domainList->relevantDomainListOriginalIndex[relevantIndex] * 2 * DIM];
1579 max_x = domainList->borders[domainList->relevantDomainListOriginalIndex[relevantIndex] * 2 * DIM + 1];
1580#if DIM > 1
1581 min_y = domainList->borders[domainList->relevantDomainListOriginalIndex[relevantIndex] * 2 * DIM + 2];
1582 max_y = domainList->borders[domainList->relevantDomainListOriginalIndex[relevantIndex] * 2 * DIM + 3];
1583#if DIM == 3
1584 min_z = domainList->borders[domainList->relevantDomainListOriginalIndex[relevantIndex] * 2 * DIM + 4];
1585 max_z = domainList->borders[domainList->relevantDomainListOriginalIndex[relevantIndex] * 2 * DIM + 5];
1586#endif
1587#endif
1588
1589 //printf("%i: borders * : (%e, %e), (%e, %e), (%e, %e)\n", relevantIndex, min_x, max_x, min_y, max_y, min_z, max_z);
1590 //printf("%i: borders: (%e, %e), (%e, %e), (%e, %e)\n", relevantIndex, domainList->borders[relevantIndex * 2 * DIM],
1591 // domainList->borders[relevantIndex * 2 * DIM + 1],
1592 // domainList->borders[relevantIndex * 2 * DIM + 2],
1593 // domainList->borders[relevantIndex * 2 * DIM + 3],
1594 // domainList->borders[relevantIndex * 2 * DIM + 4],
1595 // domainList->borders[relevantIndex * 2 * DIM + 5]);
1596
1597 // determine (smallest) distance between domain list box and (pseudo-) particle
1598 if (particles->x[currentParticleIndex] < min_x) {
1599 dx = particles->x[currentParticleIndex] - min_x;
1600 } else if (particles->x[currentParticleIndex] > max_x) {
1601 dx = particles->x[currentParticleIndex] - max_x;
1602 } else {
1603 dx = 0.;
1604 }
1605#if DIM > 1
1606 if (particles->y[currentParticleIndex] < min_y) {
1607 dy = particles->y[currentParticleIndex] - min_y;
1608 } else if (particles->y[currentParticleIndex] > max_y) {
1609 dy = particles->y[currentParticleIndex] - max_y;
1610 } else {
1611 dy = 0.;
1612 }
1613#if DIM == 3
1614 if (particles->z[currentParticleIndex] < min_z) {
1615 dz = particles->z[currentParticleIndex] - min_z;
1616 } else if (particles->z[currentParticleIndex] > max_z) { dz =
1617 particles->z[currentParticleIndex] - max_z;
1618 } else {
1619 dz = 0.;
1620 }
1621#endif
1622#endif
1623
1624#if DIM == 1
1625 r = cuda::math::sqrt(dx*dx);
1626#elif DIM == 2
1627 r = cuda::math::sqrt(dx*dx + dy*dy);
1628#else
1629 r = cuda::math::sqrt(dx*dx + dy*dy + dz*dz);
1630#endif
1631
1632 // TODO: (still?) depending on gravity force version and amount of processes: 2 * diam or 1 * diam (why?)
1633 //printf("%f >= %f (particleLevel = %i, theta = %f, r = %f)\n", powf(0.5, particleLevel-1) /* * 2*/ * diam, (theta_ * r), particleLevel, theta_, r);
1634 if (particleLevel != -1 && (((powf(0.5, particleLevel-1) /* * 2*/ * diam) >= (theta_ * r)) || isDomainListNode)) {
1635
1636 #pragma unroll
1637 for (int i = 0; i < POW_DIM; i++) {
1638
1639 //if (sendIndices[tree->child[POW_DIM * (bodyIndex + offset) + i]] != 1 && tree->child[POW_DIM * (bodyIndex + offset) + i] != -1) {
1640 // sendIndices[tree->child[POW_DIM * (bodyIndex + offset) + i]] = 2;
1641 //}
1642
1643 //if (insert && tree->child[POW_DIM * (bodyIndex + offset) + i] != -1 && particles->x[tree->child[POW_DIM * (bodyIndex + offset) + i]] == particles->x[bodyIndex + offset] &&
1644 // particles->y[tree->child[POW_DIM * (bodyIndex + offset) + i]] == particles->y[bodyIndex + offset]) {
1645 //printf("[rank %i] index = %i == child = %i ^= %i (%f, %f, %f) vs (%f, %f, %f)\n", subDomainKeyTree->rank, bodyIndex + offset, i, tree->child[POW_DIM * (bodyIndex + offset) + i],
1646 // particles->x[bodyIndex + offset], particles->y[bodyIndex + offset], particles->z[bodyIndex + offset],
1647 //
1648 // particles->x[tree->child[POW_DIM * (bodyIndex + offset) + i]], particles->y[tree->child[POW_DIM * (bodyIndex + offset) + i]], particles->z[tree->child[POW_DIM * (bodyIndex + offset) + i]]);
1649 //}
1650
1651 if (tree->child[POW_DIM * currentParticleIndex + i] != -1) {
1652 if (sendIndices[tree->child[POW_DIM * currentParticleIndex + i]] != 1) {
1653 sendIndices[tree->child[POW_DIM * currentParticleIndex + i]] = 2;
1654 }
1655 }
1656
1657 }
1658 }
1659 }
1660 __threadfence();
1661 offset += stride;
1662 }
1663 }
1664 }
1665
1666 __global__ void symbolicForce_test(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles,
1667 DomainList *domainList, integer *sendIndices, real diam, real theta_,
1668 integer n, integer m, integer relevantIndex, integer level,
1669 Curve::Type curveType) {
1670
1671 int bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1672 int stride = blockDim.x * gridDim.x;
1673 int offset = n; // start with numParticles
1674
1675 int currentParticleIndex, particleLevel, domainListLevel, currentDomainListIndex, childIndex;
1676
1677 real min_x, max_x;
1678 real dx;
1679#if DIM > 1
1680 real min_y, max_y;
1681 real dy;
1682#if DIM == 3
1683 real min_z, max_z;
1684 real dz;
1685#endif
1686#endif
1687 real r;
1688
1689 while ((bodyIndex + offset) < *tree->index) {
1690
1691 currentParticleIndex = bodyIndex + offset;
1692
1693 /*
1694 min_x = *tree->minX;
1695 max_x = *tree->maxX;
1696#if DIM > 1
1697 min_y = *tree->minY;
1698 max_y = *tree->maxY;
1699#if DIM == 3
1700 min_z = *tree->minZ;
1701 max_z = *tree->maxZ;
1702#endif
1703#endif
1704 */
1705
1706 particleLevel = particles->level[currentParticleIndex];
1707 domainListLevel = domainList->relevantDomainListLevels[relevantIndex];
1708 currentDomainListIndex = domainList->relevantDomainListIndices[relevantIndex];
1709
1710 /*
1711 for (int j = 0; j < domainListLevel; j++) {
1712
1713
1714 //childPath = 0;
1715 if (particles->x[currentDomainListIndex] < 0.5 * (min_x + max_x)) {
1716 //childPath += 1;
1717 max_x = 0.5 * (min_x + max_x);
1718 } else {
1719 min_x = 0.5 * (min_x + max_x);
1720 }
1721#if DIM > 1
1722 if (particles->y[currentDomainListIndex] < 0.5 * (min_y + max_y)) {
1723 //childPath += 2;
1724 max_y = 0.5 * (min_y + max_y);
1725 } else {
1726 min_y = 0.5 * (min_y + max_y);
1727 }
1728#if DIM == 3
1729 if (particles->z[currentDomainListIndex] < 0.5 * (min_z + max_z)) {
1730 //childPath += 4;
1731 max_z = 0.5 * (min_z + max_z);
1732 } else {
1733 min_z = 0.5 * (min_z + max_z);
1734 }
1735#endif
1736#endif
1737 }
1738 */
1739
1740 min_x = domainList->borders[domainList->relevantDomainListOriginalIndex[relevantIndex] * 2 * DIM];
1741 max_x = domainList->borders[domainList->relevantDomainListOriginalIndex[relevantIndex] * 2 * DIM + 1];
1742#if DIM > 1
1743 min_y = domainList->borders[domainList->relevantDomainListOriginalIndex[relevantIndex] * 2 * DIM + 2];
1744 max_y = domainList->borders[domainList->relevantDomainListOriginalIndex[relevantIndex] * 2 * DIM + 3];
1745#if DIM == 3
1746 min_z = domainList->borders[domainList->relevantDomainListOriginalIndex[relevantIndex] * 2 * DIM + 4];
1747 max_z = domainList->borders[domainList->relevantDomainListOriginalIndex[relevantIndex] * 2 * DIM + 5];
1748#endif
1749#endif
1750
1751 // determine (smallest) distance between domain list box and (pseudo-) particle
1752 if (particles->x[currentParticleIndex] < min_x) {
1753 dx = particles->x[currentParticleIndex] - min_x;
1754 } else if (particles->x[currentParticleIndex] > max_x) {
1755 dx = particles->x[currentParticleIndex] - max_x;
1756 } else {
1757 dx = 0.;
1758 }
1759#if DIM > 1
1760 if (particles->y[currentParticleIndex] < min_y) {
1761 dy = particles->y[currentParticleIndex] - min_y;
1762 } else if (particles->y[currentParticleIndex] > max_y) {
1763 dy = particles->y[currentParticleIndex] - max_y;
1764 } else {
1765 dy = 0.;
1766 }
1767#if DIM == 3
1768 if (particles->z[currentParticleIndex] < min_z) {
1769 dz = particles->z[currentParticleIndex] - min_z;
1770 } else if (particles->z[currentParticleIndex] > max_z) {
1771 dz = particles->z[currentParticleIndex] - max_z;
1772 } else {
1773 dz = 0.;
1774 }
1775#endif
1776#endif
1777
1778#if DIM == 1
1779 r = cuda::math::sqrt(dx*dx);
1780#elif DIM == 2
1781 r = cuda::math::sqrt(dx*dx + dy*dy);
1782#else
1783 r = cuda::math::sqrt(dx*dx + dy*dy + dz*dz);
1784#endif
1785
1786 if ((powf(0.5, particleLevel-1) /* * 2*/ * diam) >= (theta_ * r)) {
1787 #pragma unroll
1788 for (int i = 0; i < POW_DIM; i++) {
1789 childIndex = tree->child[POW_DIM * currentParticleIndex + i];
1790 if (childIndex != -1 && /* not domain list node*/particles->nodeType[childIndex] == 0) {
1791 sendIndices[childIndex] = 1;
1792 }
1793 }
1794 }
1795
1796 offset += stride;
1797 }
1798
1799 }
1800
1801 __global__ void symbolicForce_test2(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles,
1802 DomainList *domainList, integer *sendIndices, real diam, real theta_,
1803 integer n, integer m, integer relevantProc, integer relevantIndicesCounter,
1804 Curve::Type curveType) {
1805
1806 int bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1807 int stride = blockDim.x * gridDim.x;
1808 int offset = n; // start with numParticles
1809
1810 int currentParticleIndex, particleLevel, domainListLevel, currentDomainListIndex, childIndex;
1811
1812 real min_x, max_x;
1813 real dx;
1814#if DIM > 1
1815 real min_y, max_y;
1816 real dy;
1817#if DIM == 3
1818 real min_z, max_z;
1819 real dz;
1820#endif
1821#endif
1822 real r;
1823
1824 bool added;
1825
1826 while ((bodyIndex + offset) < *tree->index) {
1827
1828 added = false;
1829
1830 for (int relevantIndex = 0; relevantIndex < relevantIndicesCounter; relevantIndex++) {
1831
1832 if (domainList->relevantDomainListProcess[relevantIndex] != relevantProc) {
1833 continue;
1834 }
1835
1836 if (added) {
1837 break;
1838 }
1839
1840
1841 currentParticleIndex = bodyIndex + offset;
1842
1843 particleLevel = particles->level[currentParticleIndex];
1844 domainListLevel = domainList->relevantDomainListLevels[relevantIndex];
1845 currentDomainListIndex = domainList->relevantDomainListIndices[relevantIndex];
1846
1847
1848 min_x = domainList->borders[domainList->relevantDomainListOriginalIndex[relevantIndex] * 2 * DIM];
1849 max_x = domainList->borders[domainList->relevantDomainListOriginalIndex[relevantIndex] * 2 * DIM +
1850 1];
1851#if DIM > 1
1852 min_y = domainList->borders[domainList->relevantDomainListOriginalIndex[relevantIndex] * 2 * DIM +
1853 2];
1854 max_y = domainList->borders[domainList->relevantDomainListOriginalIndex[relevantIndex] * 2 * DIM +
1855 3];
1856#if DIM == 3
1857 min_z = domainList->borders[domainList->relevantDomainListOriginalIndex[relevantIndex] * 2 * DIM +
1858 4];
1859 max_z = domainList->borders[domainList->relevantDomainListOriginalIndex[relevantIndex] * 2 * DIM +
1860 5];
1861#endif
1862#endif
1863
1864 // determine (smallest) distance between domain list box and (pseudo-) particle
1865 if (particles->x[currentParticleIndex] < min_x) {
1866 dx = particles->x[currentParticleIndex] - min_x;
1867 } else if (particles->x[currentParticleIndex] > max_x) {
1868 dx = particles->x[currentParticleIndex] - max_x;
1869 } else {
1870 dx = 0.;
1871 }
1872#if DIM > 1
1873 if (particles->y[currentParticleIndex] < min_y) {
1874 dy = particles->y[currentParticleIndex] - min_y;
1875 } else if (particles->y[currentParticleIndex] > max_y) {
1876 dy = particles->y[currentParticleIndex] - max_y;
1877 } else {
1878 dy = 0.;
1879 }
1880#if DIM == 3
1881 if (particles->z[currentParticleIndex] < min_z) {
1882 dz = particles->z[currentParticleIndex] - min_z;
1883 } else if (particles->z[currentParticleIndex] > max_z) {
1884 dz = particles->z[currentParticleIndex] - max_z;
1885 } else {
1886 dz = 0.;
1887 }
1888#endif
1889#endif
1890
1891#if DIM == 1
1892 r = cuda::math::sqrt(dx*dx);
1893#elif DIM == 2
1894 r = cuda::math::sqrt(dx*dx + dy*dy);
1895#else
1896 r = cuda::math::sqrt(dx * dx + dy * dy + dz * dz);
1897#endif
1898
1899 if ((powf(0.5, particleLevel - 1) /* * 2*/ * diam) >= (theta_ * r)) {
1900 added = true;
1901 #pragma unroll
1902 for (int i = 0; i < POW_DIM; i++) {
1903 childIndex = tree->child[POW_DIM * currentParticleIndex + i];
1904 if (childIndex != -1 && /* not domain list node*/particles->nodeType[childIndex] == 0) {
1905 sendIndices[childIndex] = 1;
1906 }
1907 }
1908 }
1909 }
1910
1911 offset += stride;
1912 }
1913
1914 }
1915
1916 __global__ void symbolicForce_test3(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles,
1917 DomainList *domainList, integer *sendIndices, real diam, real theta_,
1918 integer n, integer m, integer relevantProc, integer relevantIndicesCounter,
1919 Curve::Type curveType) {
1920
1921 int bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1922 int stride = blockDim.x * gridDim.x;
1923 int offset = n; // start with numParticles
1924
1925 int currentParticleIndex, particleLevel, domainListLevel, currentDomainListIndex, childIndex;
1926
1927 real min_x, max_x;
1928 real dx;
1929#if DIM > 1
1930 real min_y, max_y;
1931 real dy;
1932#if DIM == 3
1933 real min_z, max_z;
1934 real dz;
1935#endif
1936#endif
1937 real r;
1938
1939 bool added;
1940
1941 while ((bodyIndex + offset) < *tree->index) {
1942
1943 added = false;
1944
1945 for (int relevantIndex = 0; relevantIndex < relevantIndicesCounter; relevantIndex++) {
1946
1947 if (domainList->relevantDomainListProcess[relevantIndex] != relevantProc) {
1948 continue;
1949 }
1950
1951 if (added) {
1952 break;
1953 }
1954
1955
1956 currentParticleIndex = bodyIndex + offset;
1957 domainListLevel = domainList->relevantDomainListLevels[relevantIndex];
1958 particleLevel = particles->level[currentParticleIndex];
1959 currentDomainListIndex = domainList->relevantDomainListIndices[relevantIndex];
1960
1961 if (particleLevel <= 6) {
1962 r = 0.;
1963 }
1964 else {
1965
1966 min_x = domainList->borders[domainList->relevantDomainListOriginalIndex[relevantIndex] * 2 *
1967 DIM];
1968 max_x = domainList->borders[
1969 domainList->relevantDomainListOriginalIndex[relevantIndex] * 2 * DIM +
1970 1];
1971#if DIM > 1
1972 min_y = domainList->borders[
1973 domainList->relevantDomainListOriginalIndex[relevantIndex] * 2 * DIM +
1974 2];
1975 max_y = domainList->borders[
1976 domainList->relevantDomainListOriginalIndex[relevantIndex] * 2 * DIM +
1977 3];
1978#if DIM == 3
1979 min_z = domainList->borders[
1980 domainList->relevantDomainListOriginalIndex[relevantIndex] * 2 * DIM +
1981 4];
1982 max_z = domainList->borders[
1983 domainList->relevantDomainListOriginalIndex[relevantIndex] * 2 * DIM +
1984 5];
1985#endif
1986#endif
1987
1988 // determine (smallest) distance between domain list box and (pseudo-) particle
1989 if (particles->x[currentParticleIndex] < min_x) {
1990 dx = particles->x[currentParticleIndex] - min_x;
1991 } else if (particles->x[currentParticleIndex] > max_x) {
1992 dx = particles->x[currentParticleIndex] - max_x;
1993 } else {
1994 dx = 0.;
1995 }
1996#if DIM > 1
1997 if (particles->y[currentParticleIndex] < min_y) {
1998 dy = particles->y[currentParticleIndex] - min_y;
1999 } else if (particles->y[currentParticleIndex] > max_y) {
2000 dy = particles->y[currentParticleIndex] - max_y;
2001 } else {
2002 dy = 0.;
2003 }
2004#if DIM == 3
2005 if (particles->z[currentParticleIndex] < min_z) {
2006 dz = particles->z[currentParticleIndex] - min_z;
2007 } else if (particles->z[currentParticleIndex] > max_z) {
2008 dz = particles->z[currentParticleIndex] - max_z;
2009 } else {
2010 dz = 0.;
2011 }
2012#endif
2013#endif
2014
2015#if DIM == 1
2016 r = cuda::math::sqrt(dx*dx);
2017#elif DIM == 2
2018 r = cuda::math::sqrt(dx*dx + dy*dy);
2019#else
2020 r = cuda::math::sqrt(dx * dx + dy * dy + dz * dz);
2021#endif
2022 }
2023
2024 if ((powf(0.5, particleLevel - 1) /* * 2*/ * diam) >= (theta_ * r)) {
2025 added = true;
2026#pragma unroll
2027 for (int i = 0; i < POW_DIM; i++) {
2028 childIndex = tree->child[POW_DIM * currentParticleIndex + i];
2029 if (childIndex != -1 && /* not domain list node*/particles->nodeType[childIndex] == 0) {
2030 sendIndices[childIndex] = 1;
2031 }
2032 }
2033 }
2034 }
2035
2036 offset += stride;
2037 }
2038
2039 }
2040
2041 __global__ void symbolicForce_test4(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles,
2042 DomainList *domainList, integer *sendIndices, real diam, real theta_,
2043 integer n, integer m, integer relevantProc, integer relevantIndicesCounter,
2044 Curve::Type curveType) {
2045
2046 int bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
2047 int stride = blockDim.x * gridDim.x;
2048 int offset = n; // start with numParticles
2049
2050 int currentParticleIndex, particleLevel, domainListLevel, currentDomainListIndex, childIndex, currentProc;
2051
2052 real min_x, max_x;
2053 real dx;
2054#if DIM > 1
2055 real min_y, max_y;
2056 real dy;
2057#if DIM == 3
2058 real min_z, max_z;
2059 real dz;
2060#endif
2061#endif
2062 real r;
2063
2064 //bool added;
2065 int added = 0;
2066
2067 while ((bodyIndex + offset) < *tree->index) {
2068
2069 //added = false;
2070
2071 for (int relevantIndex = 0; relevantIndex < relevantIndicesCounter; relevantIndex++) {
2072
2073 //if (domainList->relevantDomainListProcess[relevantIndex] != relevantProc) {
2074 // continue;
2075 //}
2076
2077 //if (domainList->relevantDomainListProcess[relevantIndex] == subDomainKeyTree->rank) {
2078 // break;
2079 //}
2080
2081 currentProc = domainList->relevantDomainListProcess[relevantIndex];
2082
2083 //if ((sendIndices[childIndex] >> currentProc) & 1) {
2084 // continue;
2085 //}
2086
2087 if ((added >> currentProc) & 1) {
2088 continue;
2089 }
2090
2091
2092 currentParticleIndex = bodyIndex + offset;
2093 domainListLevel = domainList->relevantDomainListLevels[relevantIndex];
2094 particleLevel = particles->level[currentParticleIndex];
2095 currentDomainListIndex = domainList->relevantDomainListIndices[relevantIndex];
2096
2097 if (particleLevel <= 6) {
2098 r = 0.;
2099 }
2100 else {
2101
2102 min_x = domainList->borders[domainList->relevantDomainListOriginalIndex[relevantIndex] * 2 *
2103 DIM];
2104 max_x = domainList->borders[
2105 domainList->relevantDomainListOriginalIndex[relevantIndex] * 2 * DIM +
2106 1];
2107#if DIM > 1
2108 min_y = domainList->borders[
2109 domainList->relevantDomainListOriginalIndex[relevantIndex] * 2 * DIM +
2110 2];
2111 max_y = domainList->borders[
2112 domainList->relevantDomainListOriginalIndex[relevantIndex] * 2 * DIM +
2113 3];
2114#if DIM == 3
2115 min_z = domainList->borders[
2116 domainList->relevantDomainListOriginalIndex[relevantIndex] * 2 * DIM +
2117 4];
2118 max_z = domainList->borders[
2119 domainList->relevantDomainListOriginalIndex[relevantIndex] * 2 * DIM +
2120 5];
2121#endif
2122#endif
2123
2124 // determine (smallest) distance between domain list box and (pseudo-) particle
2125 if (particles->x[currentParticleIndex] < min_x) {
2126 dx = particles->x[currentParticleIndex] - min_x;
2127 } else if (particles->x[currentParticleIndex] > max_x) {
2128 dx = particles->x[currentParticleIndex] - max_x;
2129 } else {
2130 dx = 0.;
2131 }
2132#if DIM > 1
2133 if (particles->y[currentParticleIndex] < min_y) {
2134 dy = particles->y[currentParticleIndex] - min_y;
2135 } else if (particles->y[currentParticleIndex] > max_y) {
2136 dy = particles->y[currentParticleIndex] - max_y;
2137 } else {
2138 dy = 0.;
2139 }
2140#if DIM == 3
2141 if (particles->z[currentParticleIndex] < min_z) {
2142 dz = particles->z[currentParticleIndex] - min_z;
2143 } else if (particles->z[currentParticleIndex] > max_z) {
2144 dz = particles->z[currentParticleIndex] - max_z;
2145 } else {
2146 dz = 0.;
2147 }
2148#endif
2149#endif
2150
2151#if DIM == 1
2152 r = cuda::math::sqrt(dx*dx);
2153#elif DIM == 2
2154 r = cuda::math::sqrt(dx*dx + dy*dy);
2155#else
2156 r = cuda::math::sqrt(dx * dx + dy * dy + dz * dz);
2157#endif
2158 }
2159
2160 if ((powf(0.5, particleLevel - 1) /* * 2*/ * diam) >= (theta_ * r)) {
2161 //added = true;
2162 added = added | (1 << currentProc);
2163#pragma unroll
2164 for (int i = 0; i < POW_DIM; i++) {
2165 childIndex = tree->child[POW_DIM * currentParticleIndex + i];
2166 if (childIndex != -1 && /* not domain list node*/particles->nodeType[childIndex] == 0) {
2167 sendIndices[childIndex] = sendIndices[childIndex] | (1 << currentProc);
2168 }
2169 }
2170 }
2171 }
2172
2173 added = 0;
2174 offset += stride;
2175 }
2176
2177 }
2178
2179
2180 __global__ void compTheta(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles,
2181 DomainList *domainList, Curve::Type curveType) {
2182
2183 integer index = threadIdx.x + blockIdx.x * blockDim.x;
2184 integer stride = blockDim.x * gridDim.x;
2185 integer offset = 0;
2186 integer bodyIndex;
2187 keyType key, hilbert;
2188 integer domainIndex;
2189 integer proc;
2190
2191 //"loop" over domain list nodes
2192 while ((index + offset) < *domainList->domainListIndex) {
2193
2194 bodyIndex = domainList->domainListIndices[index + offset];
2195 //calculate key
2196 //TODO: why not domainList->domainListKeys[index + offset] instead of getParticleKey()?
2197 //key = domainList->domainListKeys[index + offset]; //???
2198 //hilbert = KeyNS::lebesgue2hilbert(key, 21);
2199 key = tree->getParticleKey(particles, bodyIndex, MAX_LEVEL, curveType); // working version
2200 //if domain list node belongs to other process: add to relevant domain list indices
2201 proc = subDomainKeyTree->key2proc(key);
2202
2203 //printf("[rank %i] potential relevant domain list node: %i (%f, %f, %f)\n", subDomainKeyTree->rank,
2204 // bodyIndex, particles->x[bodyIndex],
2205 // particles->y[bodyIndex], particles->z[bodyIndex]);
2206
2207 // TODO: part of unit testing ...
2208 /*
2209 if (particles->mass[bodyIndex] == 0.) {
2210 //printf("Masses ... Domain index: %i mass = %e x = (%e, %e, %e)!\n", bodyIndex,
2211 // particles->mass[bodyIndex], particles->x[bodyIndex], particles->y[bodyIndex],
2212 // particles->z[bodyIndex]);
2213 for (int child=0; child<POW_DIM; child++) {
2214 if (tree->child[POW_DIM * bodyIndex + child] != -1 && particles->mass[tree->child[POW_DIM * bodyIndex + child]] > 0.) {
2215 printf("Masses ... Domain index: %i (type: %i) mass = %e x = (%e, %e, %e) but child %i not zero (mass = %e x = (%e, %e, %e))!!!\n", bodyIndex, particles->nodeType[bodyIndex],
2216 particles->mass[bodyIndex], particles->x[bodyIndex], particles->y[bodyIndex],
2217 particles->z[bodyIndex], child, particles->mass[tree->child[POW_DIM * bodyIndex + child]],
2218 particles->x[tree->child[POW_DIM * bodyIndex + child]], particles->y[tree->child[POW_DIM * bodyIndex + child]],
2219 particles->z[tree->child[POW_DIM * bodyIndex + child]]);
2220 }
2221 }
2222 }
2223 */
2224
2225 if (proc != subDomainKeyTree->rank && proc >= 0 && particles->mass[bodyIndex] > 0.f) {
2226 //printf("[rank = %i] proc = %i, key = %lu for x = (%f, %f, %f)\n", subDomainKeyTree->rank, proc, key, particles->x[bodyIndex], particles->y[bodyIndex], particles->z[bodyIndex]);
2227 domainIndex = atomicAdd(domainList->domainListCounter, 1);
2228 domainList->relevantDomainListIndices[domainIndex] = bodyIndex;
2229 domainList->relevantDomainListLevels[domainIndex] = domainList->domainListLevels[index + offset];
2230 domainList->relevantDomainListProcess[domainIndex] = proc;
2231 domainList->relevantDomainListOriginalIndex[domainIndex] = index + offset;
2232
2233 //printf("[rank %i] Adding relevant domain list node: %i (%f, %f, %f)\n", subDomainKeyTree->rank,
2234 // bodyIndex, particles->x[bodyIndex],
2235 // particles->y[bodyIndex], particles->z[bodyIndex]);
2236 }
2237 offset += stride;
2238 }
2239
2240 }
2241
2242
2243 __global__ void insertReceivedPseudoParticles(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles,
2244 integer *levels, int level, int n, int m) {
2245
2246 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
2247 integer stride = blockDim.x * gridDim.x;
2248 integer offset;
2249
2250 integer childPath;
2251 integer temp;
2252
2253 integer insertionLevel;
2254
2255 real min_x, max_x;
2256#if DIM > 1
2257 real min_y, max_y;
2258#if DIM == 3
2259 real min_z, max_z;
2260#endif
2261#endif
2262
2263 // debug
2264 //if (bodyIndex == 0 && level == 0) {
2265 // integer levelCounter;
2266 // for (int debugLevel = 0; debugLevel< MAX_LEVEL; debugLevel++) {
2267 // levelCounter = 0;
2268 // for (int i = 0; i < (tree->toDeleteNode[1] - tree->toDeleteNode[0]); i++) {
2269 // if (debugLevel == 0) {
2270 // if (subDomainKeyTree->key2proc(tree->getParticleKey(particles, tree->toDeleteNode[0] + i, MAX_LEVEL, Curve::lebesgue)) == subDomainKeyTree->rank) {
2271 // printf("\n-------------------------------------------------\nATTENTION\n\n-------------------------------------------------\n");
2272 // }
2273 // //if (particles->x[tree->toDeleteNode[0] + i] == 0.f && particles->y[tree->toDeleteNode[0] + i] == 0.f &&
2274 // // particles->z[tree->toDeleteNode[0] + i] == 0.f) {
2275 // // printf("\n-------------------------------------------------\nATTENTION\n\n-------------------------------------------------\n");
2276 // //}
2277 // //printf("[rank %i] index = %i level = %i x = (%f, %f, %f) m = %f\n", subDomainKeyTree->rank,
2278 // // tree->toDeleteNode[0] + i,
2279 // // levels[i],
2280 // // particles->x[tree->toDeleteNode[0] + i],
2281 // // particles->y[tree->toDeleteNode[0] + i],
2282 // // particles->z[tree->toDeleteNode[0] + i],
2283 // // particles->mass[tree->toDeleteNode[0] + i]);
2284 // }
2285 // if (levels[i] == debugLevel) {
2286 // //printf("[rank %i] level available: %i\n", subDomainKeyTree->rank, debugLevel);
2287 // levelCounter++;
2288 // }
2289 // }
2290 // if (levelCounter > 0) {
2291 // printf("[rank %i] level available: %i (# = %i)\n", subDomainKeyTree->rank, debugLevel, levelCounter);
2292 // }
2293 // }
2294 //}
2295
2296 offset = 0;
2297 while ((bodyIndex + offset) < (tree->toDeleteNode[1] - tree->toDeleteNode[0])) {
2298
2299 insertionLevel = 0;
2300
2301 //if (levels[bodyIndex + offset] < 0 || levels[bodyIndex + offset] > 21) {
2302 // printf("[rank %i] levels[%i] = %i!\n", subDomainKeyTree->rank, bodyIndex + offset, levels[bodyIndex + offset]);
2303 // assert(0);
2304 //}
2305
2306 if (levels[bodyIndex + offset] == level) {
2307
2308 min_x = *tree->minX;
2309 max_x = *tree->maxX;
2310#if DIM > 1
2311 min_y = *tree->minY;
2312 max_y = *tree->maxY;
2313#if DIM == 3
2314 min_z = *tree->minZ;
2315 max_z = *tree->maxZ;
2316#endif
2317#endif
2318 temp = 0;
2319 childPath = 0;
2320
2321 // find insertion point for body
2322 if (particles->x[tree->toDeleteNode[0] + bodyIndex + offset] < 0.5 * (min_x + max_x)) { // x direction
2323 childPath += 1;
2324 max_x = 0.5 * (min_x + max_x);
2325 }
2326 else {
2327 min_x = 0.5 * (min_x + max_x);
2328 }
2329#if DIM > 1
2330 if (particles->y[tree->toDeleteNode[0] + bodyIndex + offset] < 0.5 * (min_y + max_y)) { // y direction
2331 childPath += 2;
2332 max_y = 0.5 * (min_y + max_y);
2333 }
2334 else {
2335 min_y = 0.5 * (min_y + max_y);
2336 }
2337#if DIM == 3
2338 if (particles->z[tree->toDeleteNode[0] + bodyIndex + offset] < 0.5 * (min_z + max_z)) { // z direction
2339 childPath += 4;
2340 max_z = 0.5 * (min_z + max_z);
2341 }
2342 else {
2343 min_z = 0.5 * (min_z + max_z);
2344 }
2345#endif
2346#endif
2347 int childIndex = tree->child[temp*POW_DIM + childPath];
2348 //atomicAdd(&tree->count[childIndex], 1);
2349 insertionLevel++;
2350
2351 // debug
2352 //if (subDomainKeyTree->rank == 0) {
2353 // if (childPath < 4) {
2354 // printf("[rank %i] childPath = %i WTF?\n", subDomainKeyTree->rank, childPath);
2355 // }
2356 //}
2357 //else {
2358 // if (childPath >= 4) {
2359 // printf("[rank %i] childPath = %i WTF?\n", subDomainKeyTree->rank, childPath);
2360 // }
2361 //}
2362 // end: debug
2363
2364 // debug
2365 //if ((bodyIndex + offset) % 100 == 0) {
2366 // printf("[rank %i] childPath = %i, childIndex = %i\n", subDomainKeyTree->rank, childPath,
2367 // childIndex);
2368 //}
2369 // end: debug
2370
2371 // traverse tree until hitting leaf node
2372 while (childIndex >= m) {
2373 insertionLevel++;
2374
2375 temp = childIndex;
2376 childPath = 0;
2377
2378 // find insertion point for body
2379 if (particles->x[tree->toDeleteNode[0] + bodyIndex + offset] < 0.5 * (min_x + max_x)) { // x direction
2380 childPath += 1;
2381 max_x = 0.5 * (min_x + max_x);
2382 }
2383 else {
2384 min_x = 0.5 * (min_x + max_x);
2385 }
2386#if DIM > 1
2387 if (particles->y[tree->toDeleteNode[0] + bodyIndex + offset] < 0.5 * (min_y + max_y)) { // y direction
2388 childPath += 2;
2389 max_y = 0.5 * (min_y + max_y);
2390 }
2391 else {
2392 min_y = 0.5 * (min_y + max_y);
2393 }
2394#if DIM == 3
2395 if (particles->z[tree->toDeleteNode[0] + bodyIndex + offset] < 0.5 * (min_z + max_z)) { // z direction
2396 childPath += 4;
2397 max_z = 0.5 * (min_z + max_z);
2398 }
2399 else {
2400 min_z = 0.5 * (min_z + max_z);
2401 }
2402#endif
2403#endif
2404 //atomicAdd(&tree->count[temp], 1); // ? do not count, since particles are just temporarily saved on this process
2405 childIndex = tree->child[POW_DIM*temp + childPath];
2406
2407 }
2408#if DIM == 3
2409 if (childIndex != -1) {
2410 printf("[rank %i] (%f, %f, %f) vs (%f, %f, %f)\n", subDomainKeyTree->rank,
2411 particles->x[tree->toDeleteNode[0] + bodyIndex + offset],
2412 particles->y[tree->toDeleteNode[0] + bodyIndex + offset],
2413 particles->z[tree->toDeleteNode[0] + bodyIndex + offset],
2414 particles->x[childIndex],
2415 particles->y[childIndex],
2416 particles->z[childIndex]);
2417 cudaAssert("insertReceivedPseudoParticles(): childIndex = %i temp = %i\n", childIndex, temp);
2418 }
2419#endif
2420
2421 //insertionLevel++;
2422
2423 //temp = childIndex;
2424 tree->child[POW_DIM*temp + childPath] = tree->toDeleteNode[0] + bodyIndex + offset;
2425 //printf("[rank %i] gravity inserting POWDIM * %i + %i = %i (level = %i)\n", subDomainKeyTree->rank,
2426 // temp, childPath, tree->toDeleteNode[0] + bodyIndex + offset, level);
2427
2428 if (levels[bodyIndex + offset] != insertionLevel) {
2429 // debug
2430 //printf("[rank %i] index = %i childIndex = %i level = %i insertionLevel = %i path = %i, %i, %i, %i, %i, %i, %i, %i, %i, %i, %i\n",
2431 // subDomainKeyTree->rank, tree->toDeleteNode[0] + bodyIndex + offset, childIndex,
2432 // levels[bodyIndex + offset], insertionLevel, path[0], path[1], path[2], path[3], path[4],
2433 // path[5], path[6], path[7], path[8], path[9], path[10]);
2434 //printf("[rank %i] level = %i, insertionLevel = %i x = (%f, %f, %f) min/max = (%f, %f | %f, %f | %f, %f))\n", subDomainKeyTree->rank,
2435 // levels[bodyIndex + offset], insertionLevel,
2436 // particles->x[tree->toDeleteNode[0] + bodyIndex + offset],
2437 // particles->y[tree->toDeleteNode[0] + bodyIndex + offset],
2438 // particles->z[tree->toDeleteNode[0] + bodyIndex + offset],
2439 // min_x, max_x, min_y, max_y, min_z, max_z);
2440 //printf("[rank %i] level = %i, insertionLevel = %i x = (%f, %f, %f) min/max = (%f, %f, %f))\n", subDomainKeyTree->rank,
2441 // levels[bodyIndex + offset], insertionLevel,
2442 // particles->x[tree->toDeleteNode[0] + bodyIndex + offset],
2443 // particles->y[tree->toDeleteNode[0] + bodyIndex + offset],
2444 // particles->z[tree->toDeleteNode[0] + bodyIndex + offset],
2445 // 0.5 * (min_x + max_x), 0.5 * (min_y + max_y), 0.5 * (min_z + max_z));
2446 //for (int i=0; i < (tree->toDeleteNode[1] - tree->toDeleteNode[0]); i++) {
2447 // printf("[rank %i] index = %i level = %i x = (%f, %f, %f) m = %f\n",
2448 // subDomainKeyTree->rank,
2449 // tree->toDeleteNode[0] + i,
2450 // levels[i],
2451 // particles->x[tree->toDeleteNode[0] + i],
2452 // particles->y[tree->toDeleteNode[0] + i],
2453 // particles->z[tree->toDeleteNode[0] + i],
2454 // particles->mass[tree->toDeleteNode[0] + i]);
2455 //}
2456
2457 cudaAssert("insertReceivedPseudoParticles() for %i: level[%i] = %i != insertionLevel = %i!\n",
2458 tree->toDeleteNode[0] + bodyIndex + offset, bodyIndex + offset,
2459 levels[bodyIndex + offset], insertionLevel);
2460 }
2461 }
2462 __threadfence();
2463 offset += stride;
2464 }
2465 }
2466
2467 __global__ void insertReceivedParticles(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles,
2468 DomainList *domainList, DomainList *lowestDomainList, int n, int m) {
2469
2470 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
2471 integer stride = blockDim.x * gridDim.x;
2472
2473 integer offset;
2474
2475 real min_x, max_x;
2476#if DIM > 1
2477 real min_y, max_y;
2478#if DIM == 3
2479 real min_z, max_z;
2480#endif
2481#endif
2482 integer childPath;
2483 integer temp;
2484
2485 offset = 0;
2486
2487 bodyIndex += tree->toDeleteLeaf[0];
2488
2489 while ((bodyIndex + offset) < tree->toDeleteLeaf[1]) { // && (bodyIndex + offset) >= tree->toDeleteLeaf[0]) {
2490
2491 min_x = *tree->minX;
2492 max_x = *tree->maxX;
2493#if DIM > 1
2494 min_y = *tree->minY;
2495 max_y = *tree->maxY;
2496#if DIM == 3
2497 min_z = *tree->minZ;
2498 max_z = *tree->maxZ;
2499#endif
2500#endif
2501
2502 temp = 0;
2503 childPath = 0;
2504
2505 // find insertion point for body
2506 if (particles->x[bodyIndex + offset] < 0.5 * (min_x + max_x)) { // x direction
2507 childPath += 1;
2508 max_x = 0.5 * (min_x + max_x);
2509 }
2510 else {
2511 min_x = 0.5 * (min_x + max_x);
2512 }
2513#if DIM > 1
2514 if (particles->y[bodyIndex + offset] < 0.5 * (min_y + max_y)) { // y direction
2515 childPath += 2;
2516 max_y = 0.5 * (min_y + max_y);
2517 }
2518 else {
2519 min_y = 0.5 * (min_y + max_y);
2520 }
2521#if DIM == 3
2522 if (particles->z[bodyIndex + offset] < 0.5 * (min_z + max_z)) { // z direction
2523 childPath += 4;
2524 max_z = 0.5 * (min_z + max_z);
2525 }
2526 else {
2527 min_z = 0.5 * (min_z + max_z);
2528 }
2529#endif
2530#endif
2531 int childIndex = tree->child[temp*POW_DIM + childPath];
2532
2533 // traverse tree until hitting leaf node
2534 while (childIndex >= m) {
2535
2536 temp = childIndex;
2537
2538 childPath = 0;
2539
2540 // find insertion point for body
2541 if (particles->x[bodyIndex + offset] < 0.5 * (min_x + max_x)) { // x direction
2542 childPath += 1;
2543 max_x = 0.5 * (min_x + max_x);
2544 }
2545 else {
2546 min_x = 0.5 * (min_x + max_x);
2547 }
2548#if DIM > 1
2549 if (particles->y[bodyIndex + offset] < 0.5 * (min_y + max_y)) { // y direction
2550 childPath += 2;
2551 max_y = 0.5 * (min_y + max_y);
2552 }
2553 else {
2554 min_y = 0.5 * (min_y + max_y);
2555 }
2556#if DIM == 3
2557 if (particles->z[bodyIndex + offset] < 0.5 * (min_z + max_z)) { // z direction
2558 childPath += 4;
2559 max_z = 0.5 * (min_z + max_z);
2560 }
2561 else {
2562 min_z = 0.5 * (min_z + max_z);
2563 }
2564#endif
2565#endif
2566 //atomicAdd(&tree->count[temp], 1); // do not count, since particles are just temporarily saved on this process
2567 childIndex = tree->child[POW_DIM*temp + childPath];
2568
2569 }
2570
2571 if (childIndex != -1) {
2572 cudaAssert("ATTENTION: insertReceivedParticles(): childIndex = %i (%i, %i) (%i, %i)\n", childIndex,
2573 tree->toDeleteLeaf[0], tree->toDeleteLeaf[1], tree->toDeleteNode[0],
2574 tree->toDeleteNode[1]);
2575 //printf("[rank %i] ATTENTION: childIndex = %i,... child[8 * %i + %i] = %i (%f, %f, %f) vs (%f, %f, %f)\n", subDomainKeyTree->rank,
2576 // childIndex, temp, childPath, bodyIndex + offset,
2577 // particles->x[childIndex], particles->y[childIndex], particles->z[childIndex],
2578 // particles->x[bodyIndex + offset], particles->y[bodyIndex + offset], particles->z[bodyIndex + offset]);
2579
2580 }
2581
2582 tree->child[POW_DIM*temp + childPath] = bodyIndex + offset;
2583
2584 __threadfence();
2585 offset += stride;
2586 }
2587
2588 }
2589
2590 real Launch::collectSendIndices(Tree *tree, Particles *particles, integer *sendIndices,
2591 integer *particles2Send, integer *pseudoParticles2Send,
2592 integer *pseudoParticlesLevel,
2593 integer *particlesCount, integer *pseudoParticlesCount,
2594 integer n, integer length, Curve::Type curveType) {
2595 ExecutionPolicy executionPolicy;
2596 return cuda::launch(true, executionPolicy, ::Gravity::Kernel::collectSendIndices, tree, particles, sendIndices,
2597 particles2Send, pseudoParticles2Send, pseudoParticlesLevel, particlesCount,
2598 pseudoParticlesCount, n, length, curveType);
2599 }
2600
2601 real Launch::collectSendIndices_test4(Tree *tree, Particles *particles, integer *sendIndices,
2602 integer *particles2Send, integer *pseudoParticles2Send,
2603 integer *pseudoParticlesLevel,
2604 integer *particlesCount, integer *pseudoParticlesCount,
2605 integer numParticlesLocal, integer numParticles,
2606 integer treeIndex, int currentProc, Curve::Type curveType) {
2607 ExecutionPolicy executionPolicy;
2608 return cuda::launch(true, executionPolicy, ::Gravity::Kernel::collectSendIndices_test4, tree, particles, sendIndices,
2609 particles2Send, pseudoParticles2Send, pseudoParticlesLevel, particlesCount,
2610 pseudoParticlesCount, numParticlesLocal, numParticles, treeIndex, currentProc,
2611 curveType);
2612 }
2613
2614 real Launch::testSendIndices(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles,
2615 integer *sendIndices, integer *markedSendIndices,
2616 integer *levels, Curve::Type curveType, integer length) {
2617 ExecutionPolicy executionPolicy;
2618 return cuda::launch(true, executionPolicy, ::Gravity::Kernel::testSendIndices, subDomainKeyTree,
2619 tree, particles, sendIndices, markedSendIndices, levels, curveType, length);
2620 }
2621
2622 real Launch::computeForces_v1(Tree *tree, Particles *particles, real radius, integer n, integer m,
2623 SubDomainKeyTree *subDomainKeyTree, real theta, real smoothing,
2624 bool potentialEnergy) {
2625 size_t sharedMemory = sizeof(real) * MAX_DEPTH;
2626 ExecutionPolicy executionPolicy(256, 256, sharedMemory);
2627 //ExecutionPolicy executionPolicy(512, 256, sharedMemory);
2628 return cuda::launch(true, executionPolicy, ::Gravity::Kernel::computeForces_v1, tree, particles,
2629 radius, n, m, subDomainKeyTree, theta, smoothing, potentialEnergy);
2630 }
2631
2632 real Launch::computeForces_v1_1(Tree *tree, Particles *particles, real radius, integer n, integer m,
2633 SubDomainKeyTree *subDomainKeyTree, real theta, real smoothing,
2634 bool potentialEnergy) {
2635 size_t sharedMemory = sizeof(real) * MAX_DEPTH;
2636 ExecutionPolicy executionPolicy(256, 256, sharedMemory);
2637 //ExecutionPolicy executionPolicy(512, 256, sharedMemory);
2638 return cuda::launch(true, executionPolicy, ::Gravity::Kernel::computeForces_v1_1, tree, particles,
2639 radius, n, m, subDomainKeyTree, theta, smoothing, potentialEnergy);
2640 }
2641
2642 real Launch::computeForces_v1_2(Tree *tree, Particles *particles, real radius, integer n, integer m,
2643 SubDomainKeyTree *subDomainKeyTree, real theta, real smoothing,
2644 bool potentialEnergy) {
2645 size_t sharedMemory = (2*sizeof(int) + sizeof(real)) * MAX_DEPTH;
2646 ExecutionPolicy executionPolicy(256, 256, sharedMemory);
2647 //ExecutionPolicy executionPolicy(512, 256, sharedMemory);
2648 return cuda::launch(true, executionPolicy, ::Gravity::Kernel::computeForces_v1_2, tree, particles,
2649 radius, n, m, subDomainKeyTree, theta, smoothing, potentialEnergy);
2650 }
2651
2652 real Launch::computeForces_v2(Tree *tree, Particles *particles, real radius, integer n, integer m,
2653 integer blockSize, integer warp, integer stackSize,
2654 SubDomainKeyTree *subDomainKeyTree, real theta,
2655 real smoothing, bool potentialEnergy) {
2656
2657 size_t sharedMemory = (sizeof(real)+sizeof(integer))*stackSize*blockSize/warp;
2658 //size_t sharedMemory = 2*sizeof(real)*stackSize*blockSize/warp;
2659 ExecutionPolicy executionPolicy(256, 256, sharedMemory);
2660 //ExecutionPolicy executionPolicy(512, 256, sharedMemory);
2661 return cuda::launch(true, executionPolicy, ::Gravity::Kernel::computeForces_v2, tree, particles, radius,
2662 n, m, blockSize, warp, stackSize, subDomainKeyTree, theta, smoothing, potentialEnergy);
2663 }
2664
2665 real Launch::computeForces_v2_1(Tree *tree, Particles *particles, integer n, integer m, integer blockSize,
2666 integer warp, integer stackSize, SubDomainKeyTree *subDomainKeyTree,
2667 real theta, real smoothing, bool potentialEnergy) {
2668
2669 size_t sharedMemory = (sizeof(real)+sizeof(integer))*stackSize*blockSize/warp;
2670 //size_t sharedMemory = 2*sizeof(real)*stackSize*blockSize/warp;
2671 ExecutionPolicy executionPolicy(256, 256, sharedMemory);
2672 //ExecutionPolicy executionPolicy(512, 256, sharedMemory);
2673 return cuda::launch(true, executionPolicy, ::Gravity::Kernel::computeForces_v2_1, tree, particles, n, m,
2674 blockSize, warp, stackSize, subDomainKeyTree, theta, smoothing, potentialEnergy);
2675 }
2676
2677 real Launch::computeForces_v3(Tree *tree, Particles *particles, real radius, integer n, integer m,
2678 integer blockSize, integer warp, integer stackSize,
2679 SubDomainKeyTree *subDomainKeyTree, real theta,
2680 real smoothing, bool potentialEnergy) {
2681 //size_t sharedMemory = sizeof(real) * MAX_DEPTH;
2682 size_t sharedMemory = (sizeof(real)+sizeof(integer))*stackSize*blockSize/warp;
2683 ExecutionPolicy executionPolicy(256, 256, sharedMemory);
2684 //ExecutionPolicy executionPolicy(512, 256, sharedMemory);
2685 return cuda::launch(true, executionPolicy, ::Gravity::Kernel::computeForces_v2_1, tree, particles, n, m,
2686 blockSize, warp, stackSize, subDomainKeyTree, theta, smoothing, potentialEnergy);
2687 }
2688
2689 //real Launch::symbolicForce(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles,
2690 // DomainList *domainList, integer *sendIndices,
2691 // real diam, real theta_, integer n, integer m, integer relevantIndex,
2692 // Curve::Type curveType) {
2693 // ExecutionPolicy executionPolicy(1, 256);
2694 // return cuda::launch(true, executionPolicy, ::Gravity::Kernel::symbolicForce, subDomainKeyTree, tree,
2695 // particles, domainList, sendIndices, diam, theta_, n, m, relevantIndex, curveType);
2696 //}
2697
2698 real Launch::intermediateSymbolicForce(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles,
2699 DomainList *domainList, integer *sendIndices, real diam, real theta_,
2700 integer n, integer m, integer relevantIndex, integer level,
2701 Curve::Type curveType) {
2702 ExecutionPolicy executionPolicy;
2703 return cuda::launch(true, executionPolicy, ::Gravity::Kernel::intermediateSymbolicForce, subDomainKeyTree, tree,
2704 particles, domainList, sendIndices, diam, theta_, n, m, relevantIndex, level, curveType);
2705 }
2706
2707 real Launch::symbolicForce(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles,
2708 DomainList *domainList, integer *sendIndices, real diam, real theta_,
2709 integer n, integer m, integer relevantIndex, integer level,
2710 Curve::Type curveType) {
2711 ExecutionPolicy executionPolicy;
2712 return cuda::launch(true, executionPolicy, ::Gravity::Kernel::symbolicForce, subDomainKeyTree, tree,
2713 particles, domainList, sendIndices, diam, theta_, n, m, relevantIndex, level, curveType);
2714 }
2715
2716 real Launch::symbolicForce_test(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles,
2717 DomainList *domainList, integer *sendIndices, real diam, real theta_,
2718 integer n, integer m, integer relevantIndex, integer level,
2719 Curve::Type curveType) {
2720 ExecutionPolicy executionPolicy;
2721 return cuda::launch(true, executionPolicy, ::Gravity::Kernel::symbolicForce_test, subDomainKeyTree, tree,
2722 particles, domainList, sendIndices, diam, theta_, n, m, relevantIndex, level, curveType);
2723 }
2724
2725 real Launch::symbolicForce_test2(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles,
2726 DomainList *domainList, integer *sendIndices, real diam, real theta_,
2727 integer n, integer m, integer relevantProc, integer relevantIndicesCounter,
2728 Curve::Type curveType) {
2729
2730 ExecutionPolicy executionPolicy;
2731 return cuda::launch(true, executionPolicy, ::Gravity::Kernel::symbolicForce_test2, subDomainKeyTree, tree,
2732 particles, domainList, sendIndices, diam, theta_, n, m, relevantProc, relevantIndicesCounter,
2733 curveType);
2734 }
2735
2736 real Launch::symbolicForce_test3(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles,
2737 DomainList *domainList, integer *sendIndices, real diam, real theta_,
2738 integer n, integer m, integer relevantProc, integer relevantIndicesCounter,
2739 Curve::Type curveType) {
2740
2741 ExecutionPolicy executionPolicy;
2742 return cuda::launch(true, executionPolicy, ::Gravity::Kernel::symbolicForce_test3, subDomainKeyTree, tree,
2743 particles, domainList, sendIndices, diam, theta_, n, m, relevantProc, relevantIndicesCounter,
2744 curveType);
2745 }
2746
2747 real Launch::symbolicForce_test4(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles,
2748 DomainList *domainList, integer *sendIndices, real diam, real theta_,
2749 integer n, integer m, integer relevantProc, integer relevantIndicesCounter,
2750 Curve::Type curveType) {
2751
2752 ExecutionPolicy executionPolicy;
2753 return cuda::launch(true, executionPolicy, ::Gravity::Kernel::symbolicForce_test4, subDomainKeyTree, tree,
2754 particles, domainList, sendIndices, diam, theta_, n, m, relevantProc, relevantIndicesCounter,
2755 curveType);
2756 }
2757
2758 real Launch::compTheta(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles,
2759 DomainList *domainList, Curve::Type curveType) {
2760 ExecutionPolicy executionPolicy;
2761 return cuda::launch(true, executionPolicy, ::Gravity::Kernel::compTheta, subDomainKeyTree, tree, particles,
2762 domainList, curveType);
2763 }
2764
2765 //real Launch::insertReceivedPseudoParticles(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles,
2766 // integer *levels, int n, int m) {
2767 // ExecutionPolicy executionPolicy(1, 256);
2768 // return cuda::launch(true, executionPolicy, ::Gravity::Kernel::insertReceivedPseudoParticles,
2769 // subDomainKeyTree, tree, particles, levels, n, m);
2770 //}
2771
2772 real Launch::insertReceivedPseudoParticles(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles,
2773 integer *levels, int level, int n, int m) {
2774 ExecutionPolicy executionPolicy;
2775 return cuda::launch(true, executionPolicy, ::Gravity::Kernel::insertReceivedPseudoParticles,
2776 subDomainKeyTree, tree, particles, levels, level, n, m);
2777 }
2778
2779 real Launch::insertReceivedParticles(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles,
2780 DomainList *domainList, DomainList *lowestDomainList, int n, int m) {
2781 ExecutionPolicy executionPolicy;
2782 return cuda::launch(true, executionPolicy, ::Gravity::Kernel::insertReceivedParticles, subDomainKeyTree,
2783 tree, particles, domainList, lowestDomainList, n, m);
2784 }
2785
2786 }
2787}
DomainList
Definition: subdomain.cuh:494
DomainList::domainListIndex
integer * domainListIndex
domain list node index, thus amount of domain list nodes
Definition: subdomain.cuh:503
DomainList::relevantDomainListOriginalIndex
integer * relevantDomainListOriginalIndex
Definition: subdomain.cuh:517
DomainList::domainListIndices
integer * domainListIndices
domain list node indices
Definition: subdomain.cuh:499
DomainList::domainListLevels
integer * domainListLevels
domain list node levels
Definition: subdomain.cuh:501
DomainList::relevantDomainListIndices
integer * relevantDomainListIndices
concentrate domain list nodes, usable to reduce domain list indices in respect to some criterion
Definition: subdomain.cuh:511
DomainList::domainListCounter
integer * domainListCounter
domain list node counter, usable as buffer
Definition: subdomain.cuh:505
DomainList::relevantDomainListProcess
integer * relevantDomainListProcess
Definition: subdomain.cuh:515
DomainList::borders
real * borders
Definition: subdomain.cuh:519
DomainList::relevantDomainListLevels
integer * relevantDomainListLevels
Definition: subdomain.cuh:513
ExecutionPolicy
Execution policy/instruction for CUDA kernel execution.
Definition: cuda_launcher.cuh:33
Particles
Particle(s) class based on SoA (Structur of Arrays).
Definition: particles.cuh:50
Particles::x
real * x
(pointer to) x position (array)
Definition: particles.cuh:62
Particles::level
integer * level
(pointer to) level of the (pseudo-)particles
Definition: particles.cuh:106
Particles::g_ay
real * g_ay
(pointer to) y gravitational acceleration (array)
Definition: particles.cuh:92
Particles::nodeType
integer * nodeType
(pointer to) node type
Definition: particles.cuh:103
Particles::y
real * y
(pointer to) y position (array)
Definition: particles.cuh:70
Particles::u
real * u
energy (kinetic + gravitational for now)
Definition: particles.cuh:126
Particles::g_ax
real * g_ax
(pointer to) x gravitational acceleration (array)
Definition: particles.cuh:88
Particles::mass
real * mass
(pointer to) mass (array)
Definition: particles.cuh:60
Particles::z
real * z
(pointer to) z position (array)
Definition: particles.cuh:78
Particles::g_az
real * g_az
(pointer to) z gravitational acceleration (array)
Definition: particles.cuh:96
SubDomainKeyTree
SubDomainKeyTree class handling rank, number of processes and ranges.
Definition: subdomain.cuh:62
SubDomainKeyTree::key2proc
CUDA_CALLABLE_MEMBER integer key2proc(keyType key)
Compute particle's MPI process belonging by it's key.
Definition: subdomain.cu:68
SubDomainKeyTree::rank
integer rank
MPI rank.
Definition: subdomain.cuh:66
Tree
Tree class.
Definition: tree.cuh:140
cudaTerminate
#define cudaTerminate(...)
Definition: cuda_utilities.cuh:70
cudaAssert
#define cudaAssert(...)
Definition: cuda_utilities.cuh:50
Constants::G
constexpr real G
Gravitational constant.
Definition: parameter.h:119
DomainListNS::Kernel::lowestDomainList
__global__ void lowestDomainList(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, DomainList *lowestDomainList, integer n, integer m)
Kernel to create the lowest domain list.
Definition: subdomain.cu:2039
Gravity::Kernel::collectSendIndices_test4
__global__ void collectSendIndices_test4(Tree *tree, Particles *particles, integer *sendIndices, integer *particles2Send, integer *pseudoParticles2Send, integer *pseudoParticlesLevel, integer *particlesCount, integer *pseudoParticlesCount, integer numParticlesLocal, integer numParticles, integer treeIndex, int currentProc, Curve::Type curveType)
Definition: gravity.cu:53
Gravity::Kernel::testSendIndices
__global__ void testSendIndices(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, integer *sendIndices, integer *markedSendIndices, integer *levels, Curve::Type curveType, integer length)
Test the send indices.
Definition: gravity.cu:103
Gravity::Kernel::intermediateSymbolicForce
__global__ void intermediateSymbolicForce(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, integer *sendIndices, real diam, real theta_, integer n, integer m, integer relevantIndex, integer level, Curve::Type curveType)
Definition: gravity.cu:1374
Gravity::Kernel::computeForces_v1_2
__global__ void computeForces_v1_2(Tree *tree, Particles *particles, real radius, integer n, integer m, SubDomainKeyTree *subDomainKeyTree, real theta, real smoothing, bool potentialEnergy)
Definition: gravity.cu:550
Gravity::Kernel::preSymbolicForce
__global__ void preSymbolicForce(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, integer *sendIndices, real diam, real theta_, integer n, integer m, integer relevantIndex, integer level, Curve::Type curveType)
Definition: gravity.cu:1326
Gravity::Kernel::symbolicForce
__global__ void symbolicForce(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, integer *sendIndices, real diam, real theta_, integer n, integer m, integer relevantIndex, integer level, Curve::Type curveType)
Definition: gravity.cu:1395
Gravity::Kernel::symbolicForce_test3
__global__ void symbolicForce_test3(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, integer *sendIndices, real diam, real theta_, integer n, integer m, integer relevantProc, integer relevantIndicesCounter, Curve::Type curveType)
Definition: gravity.cu:1916
Gravity::Kernel::computeForces_v1_1
__global__ void computeForces_v1_1(Tree *tree, Particles *particles, real radius, integer n, integer m, SubDomainKeyTree *subDomainKeyTree, real theta, real smoothing, bool potentialEnergy)
Definition: gravity.cu:398
Gravity::Kernel::collectSendIndices
__global__ void collectSendIndices(Tree *tree, Particles *particles, integer *sendIndices, integer *particles2Send, integer *pseudoParticles2Send, integer *pseudoParticlesLevel, integer *particlesCount, integer *pseudoParticlesCount, integer n, integer length, Curve::Type curveType)
Collect the send indices.
Definition: gravity.cu:8
Gravity::Kernel::symbolicForce_test
__global__ void symbolicForce_test(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, integer *sendIndices, real diam, real theta_, integer n, integer m, integer relevantIndex, integer level, Curve::Type curveType)
Definition: gravity.cu:1666
Gravity::Kernel::symbolicForce_test2
__global__ void symbolicForce_test2(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, integer *sendIndices, real diam, real theta_, integer n, integer m, integer relevantProc, integer relevantIndicesCounter, Curve::Type curveType)
Definition: gravity.cu:1801
Gravity::Kernel::insertReceivedParticles
__global__ void insertReceivedParticles(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, DomainList *lowestDomainList, int n, int m)
Definition: gravity.cu:2467
Gravity::Kernel::computeForces_v2
__global__ void computeForces_v2(Tree *tree, Particles *particles, real radius, integer n, integer m, integer blockSize, integer warp, integer stackSize, SubDomainKeyTree *subDomainKeyTree, real theta, real smoothing, bool potentialEnergy)
Definition: gravity.cu:726
Gravity::Kernel::insertReceivedPseudoParticles
__global__ void insertReceivedPseudoParticles(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, integer *levels, int level, int n, int m)
Definition: gravity.cu:2243
Gravity::Kernel::computeForces_v2_1
__global__ void computeForces_v2_1(Tree *tree, Particles *particles, integer n, integer m, integer blockSize, integer warp, integer stackSize, SubDomainKeyTree *subDomainKeyTree, real theta, real smoothing, bool potentialEnergy)
Definition: gravity.cu:905
Gravity::Kernel::computeForces_v1
__global__ void computeForces_v1(Tree *tree, Particles *particles, real radius, integer n, integer m, SubDomainKeyTree *subDomainKeyTree, real theta, real smoothing, bool potentialEnergy)
Definition: gravity.cu:252
Gravity::Kernel::symbolicForce_test4
__global__ void symbolicForce_test4(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, integer *sendIndices, real diam, real theta_, integer n, integer m, integer relevantProc, integer relevantIndicesCounter, Curve::Type curveType)
Definition: gravity.cu:2041
Gravity::Kernel::resetSendIndices
__global__ void resetSendIndices(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, integer *sendIndices, real diam, real theta_, integer n, integer m, integer relevantIndex, integer level, Curve::Type curveType)
Definition: gravity.cu:1354
Gravity::Kernel::compTheta
__global__ void compTheta(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, Curve::Type curveType)
Definition: gravity.cu:2180
Gravity
Gravity related kernels/functions.
Definition: gravity.cuh:23
Kernel
Definition: device_rhs.cuh:7
ProfilerIds::Time::tree
const char *const tree
Definition: h5profiler.h:57
ProfilerIds::numParticlesLocal
const char *const numParticlesLocal
Definition: h5profiler.h:30
ProfilerIds::numParticles
const char *const numParticles
Definition: h5profiler.h:29
SPH::Kernel::particles2Send
__global__ void particles2Send(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, DomainList *lowestDomainList, integer maxLevel, integer *toSend, integer *sendCount, integer *alreadyInserted, integer insertOffset, integer numParticlesLocal, integer numParticles, integer numNodes, real radius, Curve::Type curveType=Curve::lebesgue)
Definition: sph.cu:1575
cuda::math::rsqrt
__device__ real rsqrt(real a)
Inverse square root of a floating point value.
Definition: cuda_utilities.cu:464
cuda::math::sqrt
__device__ real sqrt(real a)
Square root of a floating point value.
Definition: cuda_utilities.cu:456
cuda::math::max
__device__ real max(real a, real b)
Maximum value out of two floating point values.
Definition: cuda_utilities.cu:431
cuda::launch
real launch(bool timeKernel, const ExecutionPolicy &policy, void(*f)(Arguments...), Arguments... args)
CUDA execution wrapper function.
Definition: cuda_launcher.cuh:114
real
double real
Definition: parameter.h:15
keyType
unsigned long keyType
Definition: parameter.h:18
integer
int integer
Definition: parameter.h:17
MAX_LEVEL
#define MAX_LEVEL
Definition: parameter.h:25
MAX_DEPTH
#define MAX_DEPTH
Definition: parameter.h:105
POW_DIM
#define POW_DIM
Definition: parameter.h:40
DIM
#define DIM
Dimension of the problem.
Definition: parameter.h:38
Curve::Type
Type
Definition: parameter.h:206

milupHPC - src/gravity/gravity.cu Source File
Generated on Wed Aug 31 2022 12:16:52 by Doxygen 1.9.3