1#include "../../include/gravity/gravity.cuh"
2#include "../../include/cuda_utils/cuda_launcher.cuh"
14 integer bodyIndex = threadIdx.x + blockIdx.x*blockDim.x;
15 integer stride = blockDim.x*gridDim.x;
19 integer pseudoParticleInsertIndex;
21 while ((bodyIndex + offset) < length) {
23 if (sendIndices[bodyIndex + offset] == 1) {
26 if (bodyIndex + offset < n) {
27 particleInsertIndex = atomicAdd(particlesCount, 1);
32 pseudoParticleInsertIndex = atomicAdd(pseudoParticlesCount, 1);
33 pseudoParticles2Send[pseudoParticleInsertIndex] = bodyIndex + offset;
34 pseudoParticlesLevel[pseudoParticleInsertIndex] = particles->
level[bodyIndex + offset];
60 integer bodyIndex = threadIdx.x + blockIdx.x*blockDim.x;
61 integer stride = blockDim.x*gridDim.x;
65 integer pseudoParticleInsertIndex;
69 if ((sendIndices[bodyIndex + offset] >> currentProc) & 1) {
71 particleInsertIndex = atomicAdd(particlesCount, 1);
81 while ((bodyIndex + offset) < treeIndex) {
82 if ((sendIndices[bodyIndex + offset] >> currentProc) & 1) {
83 pseudoParticleInsertIndex = atomicAdd(pseudoParticlesCount, 1);
84 pseudoParticles2Send[pseudoParticleInsertIndex] = bodyIndex + offset;
85 pseudoParticlesLevel[pseudoParticleInsertIndex] = particles->
level[bodyIndex + offset];
107 integer bodyIndex = threadIdx.x + blockIdx.x*blockDim.x;
108 integer stride = blockDim.x*gridDim.x;
116 bool available =
false;
161 while ((bodyIndex + offset) < length) {
180 if (levels[bodyIndex + offset] > 3) {
188 for (
int j = 0; j < levels[bodyIndex + offset] - 1; j++) {
193 if (particles->
x[sendIndices[bodyIndex + offset]] < 0.5 * (min_x + max_x)) {
195 max_x = 0.5 * (min_x + max_x);
197 min_x = 0.5 * (min_x + max_x);
200 if (particles->
y[sendIndices[bodyIndex + offset]] < 0.5 * (min_y + max_y)) {
202 max_y = 0.5 * (min_y + max_y);
204 min_y = 0.5 * (min_y + max_y);
207 if (particles->
z[sendIndices[bodyIndex + offset]] < 0.5 * (min_z + max_z)) {
209 max_z = 0.5 * (min_z + max_z);
211 min_z = 0.5 * (min_z + max_z);
216 childIndex =
tree->child[
POW_DIM * temp + childPath];
217 if (bodyIndex + offset == 0) {
218 printf(
"tree->child[POW_DIM * %i + %i] = %i (%i)\n", temp, childPath,
tree->child[
POW_DIM * temp + childPath], sendIndices[bodyIndex + offset]);
223 for (
int i = 0; i < length; i++) {
224 if (temp == sendIndices[i]) {
234 cudaTerminate(
"[rank %i] %i (relevant son: %i) NOT Available sendIndices[%i] = %i, [%i] = %i)!\n",
235 subDomainKeyTree->
rank, temp, childIndex, childIndex,
236 markedSendIndices[childIndex], temp, markedSendIndices[temp])
254 bool potentialEnergy) {
257 int child, nodeIndex, childNumber, depth;
259 real px, ax, dx, f, distance;
267 real thetasq = theta*theta;
274 if (threadIdx.x == 0) {
275 cellSize[0] = 4.0 * radius * radius;
278 cellSize[i] = cellSize[i - 1] * 0.25;
284 for (ii = threadIdx.x + blockIdx.x * blockDim.x; ii < n; ii += blockDim.x * gridDim.x) {
286 i =
tree->sorted[ii];
288 px = particles->
x[i];
290 py = particles->
y[i];
292 pz = particles->
z[i];
297 particles->
g_ax[i] = 0.0;
300 particles->
g_ay[i] = 0.0;
303 particles->
g_az[i] = 0.0;
316 currentNodeIndex[depth] = 0;
317 currentChildNumber[depth] = 0;
320 childNumber = currentChildNumber[depth];
321 nodeIndex = currentNodeIndex[depth];
326 child =
tree->child[
POW_DIM * nodeIndex + childNumber];
328 }
while(child == -1 && childNumber <
POW_DIM);
330 if (child != -1 && child != i) {
332 dx = particles->
x[child] - px;
333 distance = dx*dx + smoothing;
335 dy = particles->
y[child] - py;
339 dz = particles->
z[child] - pz;
343 if (child < n || distance * thetasq > cellSize[depth]) {
347 f =
Constants::G * particles->
mass[child] / (distance * distance * distance);
349 f = particles->
mass[child] / (distance * distance * distance);
360 if (potentialEnergy) {
364 particles->
u[i] -= 0.5 * (particles->
mass[child] * particles->
mass[i]) / distance;
369 currentChildNumber[depth] = childNumber;
370 currentNodeIndex[depth] = nodeIndex;
386 particles->
g_ax[i] = ax;
389 particles->
g_ay[i] = ay;
392 particles->
g_az[i] = az;
400 bool potentialEnergy) {
402 integer i, ii, child, nodeIndex, childNumber, depth;
404 real px, ax, dx, f, distance;
413 real thetasq = theta*theta;
420 if (threadIdx.x == 0) {
421 cellSize[0] = 4.0 * radius * radius;
424 cellSize[i] = cellSize[i - 1] * 0.25;
430 for (i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += blockDim.x * gridDim.x) {
432 px = particles->
x[i];
434 py = particles->
y[i];
436 pz = particles->
z[i];
440 particles->
g_ax[i] = 0.0;
443 particles->
g_ay[i] = 0.0;
446 particles->
g_az[i] = 0.0;
459 currentNodeIndex[depth] = 0;
460 currentChildNumber[depth] = 0;
463 childNumber = currentChildNumber[depth];
464 nodeIndex = currentNodeIndex[depth];
468 child =
tree->child[
POW_DIM * nodeIndex + childNumber];
470 }
while(child == -1 && childNumber <
POW_DIM);
471 if (child != -1 && child != i) {
472 dx = particles->
x[child] - px;
473 distance = dx*dx + smoothing;
475 dy = particles->
y[child] - py;
479 dz = particles->
z[child] - pz;
483 if (child < n || distance * thetasq > cellSize[depth]) {
499 distance =
sqrt(distance);
501 f =
Constants::G * particles->
mass[child] / (distance * distance * distance);
503 f = particles->
mass[child] / (distance * distance * distance);
514 if (potentialEnergy) {
518 particles->
u[i] -= 0.5 * (particles->
mass[child] * particles->
mass[i])/distance;
523 currentChildNumber[depth] = childNumber;
524 currentNodeIndex[depth] = nodeIndex;
538 particles->
g_ax[i] = ax;
541 particles->
g_ay[i] = ay;
544 particles->
g_az[i] = az;
552 bool potentialEnergy) {
555 int child, nodeIndex, childNumber, depth;
557 real px, ax, dx, f, distance;
565 real thetasq = theta*theta;
567 __shared__
int currentNodeIndex[
MAX_DEPTH];
568 __shared__
int currentChildNumber[
MAX_DEPTH];
572 if (threadIdx.x == 0) {
573 cellSize[0] = 4.0 * radius * radius;
576 cellSize[i] = cellSize[i - 1] * 0.25;
582 for (ii = threadIdx.x + blockIdx.x * blockDim.x; ii < n; ii += blockDim.x * gridDim.x) {
584 i =
tree->sorted[ii];
586 px = particles->
x[i];
588 py = particles->
y[i];
590 pz = particles->
z[i];
594 particles->
g_ax[i] = 0.0;
597 particles->
g_ay[i] = 0.0;
600 particles->
g_az[i] = 0.0;
613 currentNodeIndex[depth] = 0;
614 currentChildNumber[depth] = 0;
617 childNumber = currentChildNumber[depth];
618 nodeIndex = currentNodeIndex[depth];
622 child =
tree->child[
POW_DIM * nodeIndex + childNumber];
624 }
while(child == -1 && childNumber <
POW_DIM);
625 if (child != -1 && child != i) {
626 dx = particles->
x[child] - px;
627 distance = dx*dx + smoothing;
629 dy = particles->
y[child] - py;
633 dz = particles->
z[child] - pz;
637 if (child < n || distance * thetasq > cellSize[depth]) {
640 f =
Constants::G * particles->
mass[child] / (distance * distance * distance);
642 f = particles->
mass[child] / (distance * distance * distance);
653 if (potentialEnergy) {
657 particles->
u[i] -= 0.5 * (particles->
mass[child] * particles->
mass[i])/distance;
662 currentChildNumber[depth] = childNumber;
663 currentNodeIndex[depth] = nodeIndex;
677 particles->
g_ax[i] = ax;
680 particles->
g_ay[i] = ay;
683 particles->
g_az[i] = az;
729 real smoothing,
bool potentialEnergy) {
731 integer bodyIndex = threadIdx.x + blockIdx.x*blockDim.x;
732 integer stride = blockDim.x*gridDim.x;
735 register int sortedIndex;
739 extern __shared__
real buffer[];
764 if (
tree->child[i] != -1) {
769 int counter = threadIdx.x % warp;
770 int stackStartIndex = stackSize*(threadIdx.x / warp);
772 while ((bodyIndex + offset) < n) {
775 sortedIndex =
tree->sorted[bodyIndex + offset];
777 pos_x = particles->
x[sortedIndex];
779 pos_y = particles->
y[sortedIndex];
781 pos_z = particles->
z[sortedIndex];
794 integer top = jj + stackStartIndex;
801 for (
int i=0; i<
POW_DIM; i++) {
802 if (
tree->child[i] != -1) {
803 stack[stackStartIndex + temp] =
tree->child[i];
804 depth[stackStartIndex + temp] = radius*radius/theta;
812 while (top >= stackStartIndex) {
816 real dp = 0.25 * depth[top];
826 real dx = particles->
x[ch] - pos_x;
828 real dy = particles->
y[ch] - pos_y;
830 real dz = particles->
z[ch] - pos_z;
834 real r = dx*dx + smoothing;
843 if (ch < m || __all_sync(__activemask(), dp <= r)) {
853 real f = particles->
mass[ch] * r * r * r;
863 if (potentialEnergy) {
868 particles->
u[bodyIndex + offset] -= 0.5 * (particles->
mass[ch] *
880 __threadfence_block();
891 particles->
g_ax[sortedIndex] = acc_x;
893 particles->
g_ay[sortedIndex] = acc_y;
895 particles->
g_az[sortedIndex] = acc_z;
907 real theta,
real smoothing,
bool potentialEnergy) {
909 integer bodyIndex = threadIdx.x + blockIdx.x*blockDim.x;
910 integer stride = blockDim.x*gridDim.x;
915 extern __shared__
real buffer[];
929 real radius = x_radius;
940 if (
tree->child[i] != -1) {
945 integer counter = threadIdx.x % warp;
946 integer stackStartIndex = stackSize*(threadIdx.x / warp);
948 while ((bodyIndex + offset) < n) {
950 real pos_x = particles->
x[bodyIndex + offset];
952 real pos_y = particles->
y[bodyIndex + offset];
954 real pos_z = particles->
z[bodyIndex + offset];
967 integer top = jj + stackStartIndex;
973 for (
int i=0; i<
POW_DIM; i++) {
974 if (
tree->child[i] != -1) {
975 stack[stackStartIndex + temp] =
tree->child[i];
976 depth[stackStartIndex + temp] = radius*radius/theta;
984 while (top >= stackStartIndex) {
988 real dp = 0.5 * depth[top];
998 real dx = particles->
x[ch] - pos_x;
1000 real dy = particles->
y[ch] - pos_y;
1002 real dz = particles->
z[ch] - pos_z;
1006 real r = dx*dx + smoothing;
1015 if (ch < m || __all_sync(__activemask(), dp <= r)) {
1025 real f = particles->
mass[ch] * r * r * r;
1035 if (potentialEnergy) {
1040 particles->
u[bodyIndex + offset] -= 0.5 * (particles->
mass[ch] *
1062 particles->
g_ax[bodyIndex + offset] = acc_x;
1064 particles->
g_ay[bodyIndex + offset] = acc_y;
1066 particles->
g_az[bodyIndex + offset] = acc_z;
1332 int bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1333 int stride = blockDim.x * gridDim.x;
1338 while ((bodyIndex + offset) < *
tree->index) {
1340 if (particles->
level[bodyIndex + offset] <= 10) {
1342 for (
int i = 0; i <
POW_DIM; i++) {
1343 childIndex =
tree->child[
POW_DIM * (bodyIndex + offset) + i];
1344 if (childIndex != -1 && particles->
nodeType[childIndex] == 0) {
1345 sendIndices[childIndex] = 2;
1360 int bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1361 int stride = blockDim.x * gridDim.x;
1364 while ((bodyIndex + offset) < *
tree->index) {
1366 if (sendIndices[bodyIndex + offset] != 2) {
1367 sendIndices[bodyIndex + offset] = -1;
1379 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1380 integer stride = blockDim.x * gridDim.x;
1383 while ((bodyIndex + offset) < *
tree->index) {
1384 if (sendIndices[bodyIndex + offset] == 2) {
1385 sendIndices[bodyIndex + offset] = 0;
1387 if (sendIndices[bodyIndex + offset] == 3) {
1388 sendIndices[bodyIndex + offset] = 1;
1400 int bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1401 int stride = blockDim.x * gridDim.x;
1405 int domainListLevel;
1406 int currentDomainListIndex;
1407 int currentParticleIndex;
1413 bool isDomainListNode;
1436 while ((bodyIndex + offset) <
POW_DIM) {
1445 if (
tree->child[bodyIndex + offset] != -1) {
1446 sendIndices[
tree->child[bodyIndex + offset]] = 0;
1453 while ((bodyIndex + offset) < *
tree->index) {
1464 currentParticleIndex = bodyIndex + offset;
1466 if ((sendIndices[currentParticleIndex] == 0 || sendIndices[currentParticleIndex] == 3) && (currentParticleIndex < n || currentParticleIndex >= m )) {
1469 isDomainListNode =
false;
1471 if (sendIndices[currentParticleIndex] == 0) {
1482 if (particles->
nodeType[bodyIndex + offset] >= 1) {
1484 isDomainListNode =
true;
1488 if (!isDomainListNode) {
1490 tree->getParticleKey(particles, currentParticleIndex,
MAX_LEVEL, curveType)) !=
1491 subDomainKeyTree->
rank) {
1498 sendIndices[currentParticleIndex] = 3;
1500 sendIndices[currentParticleIndex] = -1;
1506 particleLevel = particles->
level[currentParticleIndex];
1512 if (particleLevel < 0) {
1513 printf(
"WTF particleLevel = %i for %i (%e, %e, %e) (numParticlesLocal = %i, index = %i)\n",
1514 particleLevel, currentParticleIndex, particles->
x[currentParticleIndex],
1515 particles->
y[currentParticleIndex], particles->
z[currentParticleIndex],
1528 if (domainListLevel == -1) {
1529 cudaAssert(
"symbolicForce(): domainListLevel == -1 for (relevant) index: %i\n",
1598 if (particles->
x[currentParticleIndex] < min_x) {
1599 dx = particles->
x[currentParticleIndex] - min_x;
1600 }
else if (particles->
x[currentParticleIndex] > max_x) {
1601 dx = particles->
x[currentParticleIndex] - max_x;
1606 if (particles->
y[currentParticleIndex] < min_y) {
1607 dy = particles->
y[currentParticleIndex] - min_y;
1608 }
else if (particles->
y[currentParticleIndex] > max_y) {
1609 dy = particles->
y[currentParticleIndex] - max_y;
1614 if (particles->
z[currentParticleIndex] < min_z) {
1615 dz = particles->
z[currentParticleIndex] - min_z;
1616 }
else if (particles->
z[currentParticleIndex] > max_z) { dz =
1617 particles->
z[currentParticleIndex] - max_z;
1634 if (particleLevel != -1 && (((powf(0.5, particleLevel-1) * diam) >= (theta_ * r)) || isDomainListNode)) {
1637 for (
int i = 0; i <
POW_DIM; i++) {
1651 if (
tree->child[
POW_DIM * currentParticleIndex + i] != -1) {
1652 if (sendIndices[
tree->child[
POW_DIM * currentParticleIndex + i]] != 1) {
1653 sendIndices[
tree->child[
POW_DIM * currentParticleIndex + i]] = 2;
1671 int bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1672 int stride = blockDim.x * gridDim.x;
1675 int currentParticleIndex, particleLevel, domainListLevel, currentDomainListIndex, childIndex;
1689 while ((bodyIndex + offset) < *
tree->index) {
1691 currentParticleIndex = bodyIndex + offset;
1706 particleLevel = particles->
level[currentParticleIndex];
1752 if (particles->
x[currentParticleIndex] < min_x) {
1753 dx = particles->
x[currentParticleIndex] - min_x;
1754 }
else if (particles->
x[currentParticleIndex] > max_x) {
1755 dx = particles->
x[currentParticleIndex] - max_x;
1760 if (particles->
y[currentParticleIndex] < min_y) {
1761 dy = particles->
y[currentParticleIndex] - min_y;
1762 }
else if (particles->
y[currentParticleIndex] > max_y) {
1763 dy = particles->
y[currentParticleIndex] - max_y;
1768 if (particles->
z[currentParticleIndex] < min_z) {
1769 dz = particles->
z[currentParticleIndex] - min_z;
1770 }
else if (particles->
z[currentParticleIndex] > max_z) {
1771 dz = particles->
z[currentParticleIndex] - max_z;
1786 if ((powf(0.5, particleLevel-1) * diam) >= (theta_ * r)) {
1788 for (
int i = 0; i <
POW_DIM; i++) {
1789 childIndex =
tree->child[
POW_DIM * currentParticleIndex + i];
1790 if (childIndex != -1 && particles->
nodeType[childIndex] == 0) {
1791 sendIndices[childIndex] = 1;
1806 int bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1807 int stride = blockDim.x * gridDim.x;
1810 int currentParticleIndex, particleLevel, domainListLevel, currentDomainListIndex, childIndex;
1826 while ((bodyIndex + offset) < *
tree->index) {
1830 for (
int relevantIndex = 0; relevantIndex < relevantIndicesCounter; relevantIndex++) {
1841 currentParticleIndex = bodyIndex + offset;
1843 particleLevel = particles->
level[currentParticleIndex];
1865 if (particles->
x[currentParticleIndex] < min_x) {
1866 dx = particles->
x[currentParticleIndex] - min_x;
1867 }
else if (particles->
x[currentParticleIndex] > max_x) {
1868 dx = particles->
x[currentParticleIndex] - max_x;
1873 if (particles->
y[currentParticleIndex] < min_y) {
1874 dy = particles->
y[currentParticleIndex] - min_y;
1875 }
else if (particles->
y[currentParticleIndex] > max_y) {
1876 dy = particles->
y[currentParticleIndex] - max_y;
1881 if (particles->
z[currentParticleIndex] < min_z) {
1882 dz = particles->
z[currentParticleIndex] - min_z;
1883 }
else if (particles->
z[currentParticleIndex] > max_z) {
1884 dz = particles->
z[currentParticleIndex] - max_z;
1899 if ((powf(0.5, particleLevel - 1) * diam) >= (theta_ * r)) {
1902 for (
int i = 0; i <
POW_DIM; i++) {
1903 childIndex =
tree->child[
POW_DIM * currentParticleIndex + i];
1904 if (childIndex != -1 && particles->
nodeType[childIndex] == 0) {
1905 sendIndices[childIndex] = 1;
1921 int bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1922 int stride = blockDim.x * gridDim.x;
1925 int currentParticleIndex, particleLevel, domainListLevel, currentDomainListIndex, childIndex;
1941 while ((bodyIndex + offset) < *
tree->index) {
1945 for (
int relevantIndex = 0; relevantIndex < relevantIndicesCounter; relevantIndex++) {
1956 currentParticleIndex = bodyIndex + offset;
1958 particleLevel = particles->
level[currentParticleIndex];
1961 if (particleLevel <= 6) {
1989 if (particles->
x[currentParticleIndex] < min_x) {
1990 dx = particles->
x[currentParticleIndex] - min_x;
1991 }
else if (particles->
x[currentParticleIndex] > max_x) {
1992 dx = particles->
x[currentParticleIndex] - max_x;
1997 if (particles->
y[currentParticleIndex] < min_y) {
1998 dy = particles->
y[currentParticleIndex] - min_y;
1999 }
else if (particles->
y[currentParticleIndex] > max_y) {
2000 dy = particles->
y[currentParticleIndex] - max_y;
2005 if (particles->
z[currentParticleIndex] < min_z) {
2006 dz = particles->
z[currentParticleIndex] - min_z;
2007 }
else if (particles->
z[currentParticleIndex] > max_z) {
2008 dz = particles->
z[currentParticleIndex] - max_z;
2024 if ((powf(0.5, particleLevel - 1) * diam) >= (theta_ * r)) {
2027 for (
int i = 0; i <
POW_DIM; i++) {
2028 childIndex =
tree->child[
POW_DIM * currentParticleIndex + i];
2029 if (childIndex != -1 && particles->
nodeType[childIndex] == 0) {
2030 sendIndices[childIndex] = 1;
2046 int bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
2047 int stride = blockDim.x * gridDim.x;
2050 int currentParticleIndex, particleLevel, domainListLevel, currentDomainListIndex, childIndex, currentProc;
2067 while ((bodyIndex + offset) < *
tree->index) {
2071 for (
int relevantIndex = 0; relevantIndex < relevantIndicesCounter; relevantIndex++) {
2087 if ((added >> currentProc) & 1) {
2092 currentParticleIndex = bodyIndex + offset;
2094 particleLevel = particles->
level[currentParticleIndex];
2097 if (particleLevel <= 6) {
2125 if (particles->
x[currentParticleIndex] < min_x) {
2126 dx = particles->
x[currentParticleIndex] - min_x;
2127 }
else if (particles->
x[currentParticleIndex] > max_x) {
2128 dx = particles->
x[currentParticleIndex] - max_x;
2133 if (particles->
y[currentParticleIndex] < min_y) {
2134 dy = particles->
y[currentParticleIndex] - min_y;
2135 }
else if (particles->
y[currentParticleIndex] > max_y) {
2136 dy = particles->
y[currentParticleIndex] - max_y;
2141 if (particles->
z[currentParticleIndex] < min_z) {
2142 dz = particles->
z[currentParticleIndex] - min_z;
2143 }
else if (particles->
z[currentParticleIndex] > max_z) {
2144 dz = particles->
z[currentParticleIndex] - max_z;
2160 if ((powf(0.5, particleLevel - 1) * diam) >= (theta_ * r)) {
2162 added = added | (1 << currentProc);
2164 for (
int i = 0; i <
POW_DIM; i++) {
2165 childIndex =
tree->child[
POW_DIM * currentParticleIndex + i];
2166 if (childIndex != -1 && particles->
nodeType[childIndex] == 0) {
2167 sendIndices[childIndex] = sendIndices[childIndex] | (1 << currentProc);
2183 integer index = threadIdx.x + blockIdx.x * blockDim.x;
2184 integer stride = blockDim.x * gridDim.x;
2199 key =
tree->getParticleKey(particles, bodyIndex,
MAX_LEVEL, curveType);
2201 proc = subDomainKeyTree->
key2proc(key);
2225 if (proc != subDomainKeyTree->
rank && proc >= 0 && particles->
mass[bodyIndex] > 0.f) {
2244 integer *levels,
int level,
int n,
int m) {
2246 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
2247 integer stride = blockDim.x * gridDim.x;
2297 while ((bodyIndex + offset) < (
tree->toDeleteNode[1] -
tree->toDeleteNode[0])) {
2306 if (levels[bodyIndex + offset] == level) {
2308 min_x = *
tree->minX;
2309 max_x = *
tree->maxX;
2311 min_y = *
tree->minY;
2312 max_y = *
tree->maxY;
2314 min_z = *
tree->minZ;
2315 max_z = *
tree->maxZ;
2322 if (particles->
x[
tree->toDeleteNode[0] + bodyIndex + offset] < 0.5 * (min_x + max_x)) {
2324 max_x = 0.5 * (min_x + max_x);
2327 min_x = 0.5 * (min_x + max_x);
2330 if (particles->
y[
tree->toDeleteNode[0] + bodyIndex + offset] < 0.5 * (min_y + max_y)) {
2332 max_y = 0.5 * (min_y + max_y);
2335 min_y = 0.5 * (min_y + max_y);
2338 if (particles->
z[
tree->toDeleteNode[0] + bodyIndex + offset] < 0.5 * (min_z + max_z)) {
2340 max_z = 0.5 * (min_z + max_z);
2343 min_z = 0.5 * (min_z + max_z);
2347 int childIndex =
tree->child[temp*
POW_DIM + childPath];
2372 while (childIndex >= m) {
2379 if (particles->
x[
tree->toDeleteNode[0] + bodyIndex + offset] < 0.5 * (min_x + max_x)) {
2381 max_x = 0.5 * (min_x + max_x);
2384 min_x = 0.5 * (min_x + max_x);
2387 if (particles->
y[
tree->toDeleteNode[0] + bodyIndex + offset] < 0.5 * (min_y + max_y)) {
2389 max_y = 0.5 * (min_y + max_y);
2392 min_y = 0.5 * (min_y + max_y);
2395 if (particles->
z[
tree->toDeleteNode[0] + bodyIndex + offset] < 0.5 * (min_z + max_z)) {
2397 max_z = 0.5 * (min_z + max_z);
2400 min_z = 0.5 * (min_z + max_z);
2405 childIndex =
tree->child[
POW_DIM*temp + childPath];
2409 if (childIndex != -1) {
2410 printf(
"[rank %i] (%f, %f, %f) vs (%f, %f, %f)\n", subDomainKeyTree->
rank,
2411 particles->
x[
tree->toDeleteNode[0] + bodyIndex + offset],
2412 particles->
y[
tree->toDeleteNode[0] + bodyIndex + offset],
2413 particles->
z[
tree->toDeleteNode[0] + bodyIndex + offset],
2414 particles->
x[childIndex],
2415 particles->
y[childIndex],
2416 particles->
z[childIndex]);
2417 cudaAssert(
"insertReceivedPseudoParticles(): childIndex = %i temp = %i\n", childIndex, temp);
2424 tree->child[
POW_DIM*temp + childPath] =
tree->toDeleteNode[0] + bodyIndex + offset;
2428 if (levels[bodyIndex + offset] != insertionLevel) {
2457 cudaAssert(
"insertReceivedPseudoParticles() for %i: level[%i] = %i != insertionLevel = %i!\n",
2458 tree->toDeleteNode[0] + bodyIndex + offset, bodyIndex + offset,
2459 levels[bodyIndex + offset], insertionLevel);
2470 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
2471 integer stride = blockDim.x * gridDim.x;
2487 bodyIndex +=
tree->toDeleteLeaf[0];
2489 while ((bodyIndex + offset) <
tree->toDeleteLeaf[1]) {
2491 min_x = *
tree->minX;
2492 max_x = *
tree->maxX;
2494 min_y = *
tree->minY;
2495 max_y = *
tree->maxY;
2497 min_z = *
tree->minZ;
2498 max_z = *
tree->maxZ;
2506 if (particles->
x[bodyIndex + offset] < 0.5 * (min_x + max_x)) {
2508 max_x = 0.5 * (min_x + max_x);
2511 min_x = 0.5 * (min_x + max_x);
2514 if (particles->
y[bodyIndex + offset] < 0.5 * (min_y + max_y)) {
2516 max_y = 0.5 * (min_y + max_y);
2519 min_y = 0.5 * (min_y + max_y);
2522 if (particles->
z[bodyIndex + offset] < 0.5 * (min_z + max_z)) {
2524 max_z = 0.5 * (min_z + max_z);
2527 min_z = 0.5 * (min_z + max_z);
2531 int childIndex =
tree->child[temp*
POW_DIM + childPath];
2534 while (childIndex >= m) {
2541 if (particles->
x[bodyIndex + offset] < 0.5 * (min_x + max_x)) {
2543 max_x = 0.5 * (min_x + max_x);
2546 min_x = 0.5 * (min_x + max_x);
2549 if (particles->
y[bodyIndex + offset] < 0.5 * (min_y + max_y)) {
2551 max_y = 0.5 * (min_y + max_y);
2554 min_y = 0.5 * (min_y + max_y);
2557 if (particles->
z[bodyIndex + offset] < 0.5 * (min_z + max_z)) {
2559 max_z = 0.5 * (min_z + max_z);
2562 min_z = 0.5 * (min_z + max_z);
2567 childIndex =
tree->child[
POW_DIM*temp + childPath];
2571 if (childIndex != -1) {
2572 cudaAssert(
"ATTENTION: insertReceivedParticles(): childIndex = %i (%i, %i) (%i, %i)\n", childIndex,
2573 tree->toDeleteLeaf[0],
tree->toDeleteLeaf[1],
tree->toDeleteNode[0],
2574 tree->toDeleteNode[1]);
2582 tree->child[
POW_DIM*temp + childPath] = bodyIndex + offset;
2592 integer *pseudoParticlesLevel,
2597 particles2Send, pseudoParticles2Send, pseudoParticlesLevel, particlesCount,
2598 pseudoParticlesCount, n, length, curveType);
2603 integer *pseudoParticlesLevel,
2609 particles2Send, pseudoParticles2Send, pseudoParticlesLevel, particlesCount,
2619 tree, particles, sendIndices, markedSendIndices, levels, curveType, length);
2624 bool potentialEnergy) {
2629 radius, n, m, subDomainKeyTree, theta, smoothing, potentialEnergy);
2634 bool potentialEnergy) {
2639 radius, n, m, subDomainKeyTree, theta, smoothing, potentialEnergy);
2644 bool potentialEnergy) {
2645 size_t sharedMemory = (2*
sizeof(int) +
sizeof(
real)) *
MAX_DEPTH;
2649 radius, n, m, subDomainKeyTree, theta, smoothing, potentialEnergy);
2655 real smoothing,
bool potentialEnergy) {
2657 size_t sharedMemory = (
sizeof(
real)+
sizeof(
integer))*stackSize*blockSize/warp;
2662 n, m, blockSize, warp, stackSize, subDomainKeyTree, theta, smoothing, potentialEnergy);
2667 real theta,
real smoothing,
bool potentialEnergy) {
2669 size_t sharedMemory = (
sizeof(
real)+
sizeof(
integer))*stackSize*blockSize/warp;
2674 blockSize, warp, stackSize, subDomainKeyTree, theta, smoothing, potentialEnergy);
2680 real smoothing,
bool potentialEnergy) {
2682 size_t sharedMemory = (
sizeof(
real)+
sizeof(
integer))*stackSize*blockSize/warp;
2686 blockSize, warp, stackSize, subDomainKeyTree, theta, smoothing, potentialEnergy);
2704 particles, domainList, sendIndices, diam, theta_, n, m, relevantIndex, level, curveType);
2713 particles, domainList, sendIndices, diam, theta_, n, m, relevantIndex, level, curveType);
2722 particles, domainList, sendIndices, diam, theta_, n, m, relevantIndex, level, curveType);
2732 particles, domainList, sendIndices, diam, theta_, n, m, relevantProc, relevantIndicesCounter,
2743 particles, domainList, sendIndices, diam, theta_, n, m, relevantProc, relevantIndicesCounter,
2754 particles, domainList, sendIndices, diam, theta_, n, m, relevantProc, relevantIndicesCounter,
2762 domainList, curveType);
2773 integer *levels,
int level,
int n,
int m) {
2776 subDomainKeyTree,
tree, particles, levels, level, n, m);
integer * domainListIndex
domain list node index, thus amount of domain list nodes
integer * relevantDomainListOriginalIndex
integer * domainListIndices
domain list node indices
integer * domainListLevels
domain list node levels
integer * relevantDomainListIndices
concentrate domain list nodes, usable to reduce domain list indices in respect to some criterion
integer * domainListCounter
domain list node counter, usable as buffer
integer * relevantDomainListProcess
integer * relevantDomainListLevels
Execution policy/instruction for CUDA kernel execution.
Particle(s) class based on SoA (Structur of Arrays).
real * x
(pointer to) x position (array)
integer * level
(pointer to) level of the (pseudo-)particles
real * g_ay
(pointer to) y gravitational acceleration (array)
integer * nodeType
(pointer to) node type
real * y
(pointer to) y position (array)
real * u
energy (kinetic + gravitational for now)
real * g_ax
(pointer to) x gravitational acceleration (array)
real * mass
(pointer to) mass (array)
real * z
(pointer to) z position (array)
real * g_az
(pointer to) z gravitational acceleration (array)
SubDomainKeyTree class handling rank, number of processes and ranges.
CUDA_CALLABLE_MEMBER integer key2proc(keyType key)
Compute particle's MPI process belonging by it's key.
#define cudaTerminate(...)
constexpr real G
Gravitational constant.
__global__ void lowestDomainList(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, DomainList *lowestDomainList, integer n, integer m)
Kernel to create the lowest domain list.
__global__ void collectSendIndices_test4(Tree *tree, Particles *particles, integer *sendIndices, integer *particles2Send, integer *pseudoParticles2Send, integer *pseudoParticlesLevel, integer *particlesCount, integer *pseudoParticlesCount, integer numParticlesLocal, integer numParticles, integer treeIndex, int currentProc, Curve::Type curveType)
__global__ void testSendIndices(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, integer *sendIndices, integer *markedSendIndices, integer *levels, Curve::Type curveType, integer length)
Test the send indices.
__global__ void intermediateSymbolicForce(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, integer *sendIndices, real diam, real theta_, integer n, integer m, integer relevantIndex, integer level, Curve::Type curveType)
__global__ void computeForces_v1_2(Tree *tree, Particles *particles, real radius, integer n, integer m, SubDomainKeyTree *subDomainKeyTree, real theta, real smoothing, bool potentialEnergy)
__global__ void preSymbolicForce(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, integer *sendIndices, real diam, real theta_, integer n, integer m, integer relevantIndex, integer level, Curve::Type curveType)
__global__ void symbolicForce(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, integer *sendIndices, real diam, real theta_, integer n, integer m, integer relevantIndex, integer level, Curve::Type curveType)
__global__ void symbolicForce_test3(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, integer *sendIndices, real diam, real theta_, integer n, integer m, integer relevantProc, integer relevantIndicesCounter, Curve::Type curveType)
__global__ void computeForces_v1_1(Tree *tree, Particles *particles, real radius, integer n, integer m, SubDomainKeyTree *subDomainKeyTree, real theta, real smoothing, bool potentialEnergy)
__global__ void collectSendIndices(Tree *tree, Particles *particles, integer *sendIndices, integer *particles2Send, integer *pseudoParticles2Send, integer *pseudoParticlesLevel, integer *particlesCount, integer *pseudoParticlesCount, integer n, integer length, Curve::Type curveType)
Collect the send indices.
__global__ void symbolicForce_test(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, integer *sendIndices, real diam, real theta_, integer n, integer m, integer relevantIndex, integer level, Curve::Type curveType)
__global__ void symbolicForce_test2(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, integer *sendIndices, real diam, real theta_, integer n, integer m, integer relevantProc, integer relevantIndicesCounter, Curve::Type curveType)
__global__ void insertReceivedParticles(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, DomainList *lowestDomainList, int n, int m)
__global__ void computeForces_v2(Tree *tree, Particles *particles, real radius, integer n, integer m, integer blockSize, integer warp, integer stackSize, SubDomainKeyTree *subDomainKeyTree, real theta, real smoothing, bool potentialEnergy)
__global__ void insertReceivedPseudoParticles(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, integer *levels, int level, int n, int m)
__global__ void computeForces_v2_1(Tree *tree, Particles *particles, integer n, integer m, integer blockSize, integer warp, integer stackSize, SubDomainKeyTree *subDomainKeyTree, real theta, real smoothing, bool potentialEnergy)
__global__ void computeForces_v1(Tree *tree, Particles *particles, real radius, integer n, integer m, SubDomainKeyTree *subDomainKeyTree, real theta, real smoothing, bool potentialEnergy)
__global__ void symbolicForce_test4(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, integer *sendIndices, real diam, real theta_, integer n, integer m, integer relevantProc, integer relevantIndicesCounter, Curve::Type curveType)
__global__ void resetSendIndices(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, integer *sendIndices, real diam, real theta_, integer n, integer m, integer relevantIndex, integer level, Curve::Type curveType)
__global__ void compTheta(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, Curve::Type curveType)
Gravity related kernels/functions.
const char *const numParticlesLocal
const char *const numParticles
__global__ void particles2Send(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, DomainList *lowestDomainList, integer maxLevel, integer *toSend, integer *sendCount, integer *alreadyInserted, integer insertOffset, integer numParticlesLocal, integer numParticles, integer numNodes, real radius, Curve::Type curveType=Curve::lebesgue)
__device__ real rsqrt(real a)
Inverse square root of a floating point value.
__device__ real sqrt(real a)
Square root of a floating point value.
__device__ real max(real a, real b)
Maximum value out of two floating point values.
real launch(bool timeKernel, const ExecutionPolicy &policy, void(*f)(Arguments...), Arguments... args)
CUDA execution wrapper function.
#define DIM
Dimension of the problem.