1#include "../../include/sph/sph.cuh"
2#include "../../include/cuda_utils/cuda_launcher.cuh"
5#define MAX_VARIABLE_SML_ITERATIONS 5
7#define TOLERANCE_WANTED_NUMBER_OF_INTERACTIONS 5
15 boost::mpi::communicator comm;
17 std::vector<boost::mpi::request> reqParticles;
18 std::vector<boost::mpi::status> statParticles;
24 if (proc != subDomainKeyTree->
rank) {
25 reqParticles.push_back(comm.isend(proc, 17, toSend, sendLengths[proc]));
26 statParticles.push_back(comm.recv(proc, 17, &entry[
numParticlesLocal] + receiveOffset, receiveLengths[proc]));
27 receiveOffset += receiveLengths[proc];
30 boost::mpi::wait_all(reqParticles.begin(), reqParticles.end());
40 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
41 integer stride = blockDim.x * gridDim.x;
51 for (
int i=0; i<
tree->toDeleteLeaf[1]; ++i) {
52 if ((bodyIndex + offset) != i) {
54 dx = particles->
x[bodyIndex + offset] - particles->
x[i];
56 dy = particles->
y[bodyIndex + offset] - particles->
y[i];
58 dz = particles->
z[bodyIndex + offset] - particles->
z[i];
67 distance = dx * dx + dy * dy + dz * dz;
70 if (distance < (particles->
sml[bodyIndex + offset] * particles->
sml[bodyIndex + offset]) &&
71 distance < (particles->
sml[i] * particles->
sml[i])) {
80 cudaTerminate(
"numInteractions = %i > MAX_NUM_INTERACTIONS = %i\n", numInteractions,
86 particles->
noi[bodyIndex + offset] = numInteractions;
96 register int bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
97 register int stride = blockDim.x * gridDim.x;
98 register int offset = 0;
101 register integer childNumber, nodeIndex, depth, childIndex;
114 register real d, r, interactionDistance;
116 register int noOfInteractions;
118 register int currentNodeIndex[
MAX_DEPTH];
119 register int currentChildNumber[
MAX_DEPTH];
126 index = bodyIndex + offset;
128 x = particles->
x[index];
130 y = particles->
y[index];
132 z = particles->
z[index];
135 sml = particles->
sml[index];
145 currentNodeIndex[depth] = 0;
146 currentChildNumber[depth] = 0;
147 noOfInteractions = 0;
151 interactionDistance = (r + sml);
156 childNumber = currentChildNumber[depth];
157 nodeIndex = currentNodeIndex[depth];
160 while (childNumber <
POW_DIM) {
162 childIndex =
tree->child[
POW_DIM * nodeIndex + childNumber];
165 if (childIndex != -1 && childIndex != index) {
175 smlj = particles->
sml[childIndex];
177 dx = x - particles->
x[childIndex];
179 dy = y - particles->
y[childIndex];
181 dz = z - particles->
z[childIndex];
191 d = dx*dx + dy*dy + dz*dz;
198 if (d < (sml * sml) &&
235 currentChildNumber[depth] = childNumber;
236 currentNodeIndex[depth] = nodeIndex;
252 interactionDistance = (r + sml);
262 nodeIndex = childIndex;
271 interactionDistance = (r + sml);
273 }
while (depth >= 0);
275 particles->
noi[index] = noOfInteractions;
286 int bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
287 int stride = blockDim.x * gridDim.x;
295 integer childNumber, nodeIndex, childIndex;
298 register real dx, min_x, max_x, x_temp, x_child, min_dx, max_dx, x;
300 register real dy, min_y, max_y, y_temp, y_child, min_dy, max_dy, y;
302 register real dz, min_z, max_z, z_temp, z_child, min_dz, max_dz, z;
311 register int outer_currentNodeIndex[
MAX_DEPTH];
312 register int outer_currentChildNumber[
MAX_DEPTH];
313 register int outer_currentNodeLevel[
MAX_DEPTH];
319 int *inner_currentNodeIndex;
320 int *inner_currentChildNumber;
322 int depth, inner_depth;
323 int noOfInteractions;
329 index = bodyIndex + offset;
335 x = particles->
x[index];
337 y = particles->
y[index];
339 z = particles->
z[index];
342 sml = particles->
sml[index];
345 outer_currentNodeIndex[depth] = 0;
346 outer_currentChildNumber[depth] = 0;
347 outer_currentNodeLevel[depth] = 1;
348 noOfInteractions = 0;
352 childNumber = outer_currentChildNumber[depth];
353 nodeIndex = outer_currentNodeIndex[depth];
354 level = outer_currentNodeLevel[depth];
356 while (childNumber <
POW_DIM) {
358 childIndex =
tree->child[
POW_DIM * nodeIndex + childNumber];
361 if (childIndex != -1 && childIndex != (index)) {
363 x_child = particles->
x[childIndex];
365 y_child = particles->
y[childIndex];
367 z_child = particles->
z[childIndex];
384 for (
int _level=0; _level < level; ++_level) {
385 if (x_child < 0.5 * (min_x + max_x)) {
386 max_x = 0.5 * (min_x + max_x);
389 min_x = 0.5 * (min_x + max_x);
392 if (y_child < 0.5 * (min_y + max_y)) {
393 max_y = 0.5 * (min_y + max_y);
396 min_y = 0.5 * (min_y + max_y);
399 if (z_child < 0.5 * (min_z + max_z)) {
400 max_z = 0.5 * (min_z + max_z);
403 min_z = 0.5 * (min_z + max_z);
412 }
else if (x > max_x) {
420 max_dx = (tmp1 > tmp2) ? tmp1 : tmp2;
426 }
else if (y > max_y) {
434 max_dy = (tmp1 > tmp2) ? tmp1 : tmp2;
440 }
else if (z > max_z) {
448 max_dz = (tmp1 > tmp2) ? tmp1 : tmp2;
455 min_distance = min_dx*min_dx;
456 max_distance = max_dx*max_dx;
459 min_distance = min_dx*min_dx + min_dy*min_dy;
460 max_distance = max_dx*max_dx + max_dy*max_dy;
462 min_distance = min_dx*min_dx + min_dy*min_dy + min_dz*min_dz;
463 max_distance = max_dx*max_dx + max_dy*max_dy + max_dz*max_dz;
493 d = dx*dx + dy*dy + dz*dz;
496 if (d < (sml * sml) &&
497 d < (particles->
sml[childIndex] * particles->
sml[childIndex])) {
502 cudaTerminate(
"noOfInteractions = %i > MAX_NUM_INTERACTIONS = %i\n",
508 else if ((sml*sml) > min_distance) {
510 if ((sml*sml) >= max_distance) {
512 inner_currentNodeIndex = &outer_currentNodeIndex[depth];
513 inner_currentChildNumber = &outer_currentChildNumber[depth];
516 inner_currentNodeIndex[inner_depth] = childIndex;
517 inner_currentChildNumber[inner_depth] = 0;
518 int inner_childNumber;
520 int inner_childIndex;
524 inner_childNumber = inner_currentChildNumber[inner_depth];
525 inner_nodeIndex = inner_currentNodeIndex[inner_depth];
527 while (inner_childNumber <
POW_DIM) {
529 inner_childIndex =
tree->child[
POW_DIM * inner_nodeIndex + inner_childNumber];
532 if (inner_childIndex != -1 && inner_childIndex != (index)) {
535 noOfInteractions] = inner_childIndex;
539 cudaTerminate(
"noOfInteractions = %i > MAX_NUM_INTERACTIONS = %i\n",
544 inner_currentChildNumber[inner_depth] = inner_childNumber;
545 inner_currentNodeIndex[inner_depth] = inner_nodeIndex;
548 inner_childNumber = 0;
549 inner_nodeIndex = inner_childIndex;
555 }
while (inner_depth >= 0);
561 outer_currentChildNumber[depth] = childNumber;
562 outer_currentNodeIndex[depth] = nodeIndex;
563 outer_currentNodeLevel[depth] = level;
571 nodeIndex = childIndex;
579 }
while (depth >= 0);
581 particles->
noi[index] = noOfInteractions;
595 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
596 integer stride = blockDim.x * gridDim.x;
599 integer childNumber, nodeIndex, depth, childIndex;
601 register real dx, x_radius;
603 register real dy, y_radius;
605 register real dz, z_radius;
610 real d, r, interactionDistance;
614 extern __shared__
int buffer[];
616 integer *currentChildNumber = (
int*)¤tNodeIndex[(10 + threadIdx.x) *
MAX_DEPTH];
632 currentNodeIndex[depth] = 0;
633 currentChildNumber[depth] = 0;
634 noOfInteractions = 0;
636 x_radius = 0.5 * (*
tree->maxX - (*
tree->minX));
638 y_radius = 0.5 * (*
tree->maxY - (*
tree->minY));
640 z_radius = 0.5 * (*
tree->maxZ - (*
tree->minZ));
653 interactionDistance = (r + particles->
sml[bodyIndex + offset]);
656 childNumber = currentChildNumber[depth];
657 nodeIndex = currentNodeIndex[depth];
659 while (childNumber <
POW_DIM) {
660 childIndex =
tree->child[
POW_DIM * nodeIndex + childNumber];
663 if (childIndex != -1 && childIndex != (bodyIndex + offset)) {
664 dx = particles->
x[bodyIndex + offset] - particles->
x[childIndex];
666 dy = particles->
y[bodyIndex + offset] - particles->
y[childIndex];
668 dz = particles->
z[bodyIndex + offset] - particles->
z[childIndex];
678 d = dx*dx + dy*dy + dz*dz;
681 if ((bodyIndex + offset) % 1000 == 0) {
685 if (d < (particles->
sml[bodyIndex + offset] * particles->
sml[bodyIndex + offset])
686 && d < (particles->
sml[childIndex] * particles->
sml[childIndex])) {
689 noOfInteractions] = childIndex;
695 particles->
nodeType[childIndex] >= 1) {
699 particles->
nodeType[childIndex] >= 1) {
704 particles->
nodeType[childIndex] >= 1) {
707 currentChildNumber[depth] = childNumber;
708 currentNodeIndex[depth] = nodeIndex;
711 interactionDistance = (r + particles->
sml[bodyIndex + offset]);
717 nodeIndex = childIndex;
724 interactionDistance = (r + particles->
sml[bodyIndex + offset]);
726 }
while (depth >= 0);
728 particles->
noi[bodyIndex + offset] = noOfInteractions;
743 int i, inc, nodeIndex, depth, childNumber, child;
746 int numberOfInteractions;
748 real x, dx, interactionDistance, r, radius, d;
765 x_radius = (*
tree->maxX - (*
tree->minX));
767 y_radius = (*
tree->maxY - (*
tree->minY));
769 z_radius = (*
tree->maxZ - (*
tree->minZ));
782 inc = blockDim.x * gridDim.x;
784 for (i = threadIdx.x + blockIdx.x * blockDim.x; i <
numParticlesLocal; i += inc) {
809 htmp = particles->
sml[i];
813 numberOfInteractions = 0;
816 currentNodeIndex[depth] = 0;
817 currentChildNumber[depth] = 0;
818 numberOfInteractions = 0;
820 interactionDistance = (r + htmp);
823 childNumber = currentChildNumber[depth];
824 nodeIndex = currentNodeIndex[depth];
826 while (childNumber <
POW_DIM) {
827 child =
tree->child[
POW_DIM * nodeIndex + childNumber];
830 if (child != -1 && child != i) {
831 dx = x - particles->
x[child];
833 dy = y - particles->
y[child];
835 dz = z - particles->
z[child];
846 htmpj = particles->
sml[child];
848 if (d < htmp*htmp && d < htmpj*htmpj) {
849 numberOfInteractions++;
860 ) || particles->
nodeType[child] >= 1) {
862 currentChildNumber[depth] = childNumber;
863 currentNodeIndex[depth] = nodeIndex;
871 interactionDistance = (r + htmp);
905 interactionDistance = (r + htmp);
906 }
while (depth >= 0);
916 particles->
sml[i] = htmp;
920 if (numberOfInteractions < 1)
921 numberOfInteractions = 1;
927 if (numberOfInteractions < 1)
928 numberOfInteractions = 1;
937 printf(
"+++ particle: %d it: %d htmp: %e htmpold: %e wanted: %d current: %d mId: %d uid: %i (%e, %e, %e) n = %i\n", i, nit,
946 printf(
"+++ particle: %d it: %d htmp: %e htmpold: %e wanted: %d current: %d mId: %d uid: %i (%e, %e, %e) n = %i\n",
952 cudaTerminate(
"numberOfInteractions = %i > MAX_NUM_INTERACTIONS = %i\n", numberOfInteractions,
966 register int i, inc, nodeIndex, depth, childNumber, child;
968 register real x, dx, interactionDistance, r, d;
978 register int currentNodeIndex[
MAX_DEPTH];
979 register int currentChildNumber[
MAX_DEPTH];
980 register int numberOfInteractions;
989 currentNodeIndex[depth] = 0;
990 currentChildNumber[depth] = 0;
991 numberOfInteractions = 0;
993 sml = particles->
sml[i];
994 particles->
noi[i] = 0;
995 interactionDistance = (r + sml);
998 childNumber = currentChildNumber[depth];
999 nodeIndex = currentNodeIndex[depth];
1000 while (childNumber <
POW_DIM) {
1001 child =
tree->child[
POW_DIM * nodeIndex + childNumber];
1003 if (child != -1 && child != i) {
1004 dx = x - particles->
x[child];
1006 dy = y - particles->
y[child];
1008 dz = z - particles->
z[child];
1023 smlj = particles->
sml[child];
1025 if (d < sml*sml && d < smlj*smlj) {
1027 numberOfInteractions++;
1052 currentChildNumber[depth] = childNumber;
1053 currentNodeIndex[depth] = nodeIndex;
1056 interactionDistance = (r + sml);
1068 interactionDistance = (r + sml);
1069 }
while (depth >= 0);
1072 printf(
"ERROR: Maximum number of interactions exceeded: %d / %d\n", numberOfInteractions,
MAX_NUM_INTERACTIONS);
1077 particles->
noi[i] = numberOfInteractions;
1083 integer index = threadIdx.x + blockIdx.x * blockDim.x;
1084 integer stride = blockDim.x * gridDim.x;
1101 if (curveType == Curve::Type::lebesgue) {
1108 if (proc != subDomainKeyTree->
rank) {
1117 lowestDomainList->relevantDomainListOriginalIndex[domainIndex] = index + offset;
1133 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1134 integer stride = blockDim.x * gridDim.x;
1143 real dx, min_x, max_x;
1145 real dy, min_y, max_y;
1147 real dz, min_z, max_z;
1152 while ((bodyIndex + offset) < n) {
1155 min_x = *
tree->minX;
1156 max_x = *
tree->maxX;
1158 min_y = *
tree->minY;
1159 max_y = *
tree->maxY;
1161 min_z = *
tree->minZ;
1162 max_z = *
tree->maxZ;
1174 for (
int j = 0; j <
lowestDomainList->relevantDomainListLevels[relevantIndex]; j++) {
1176 if (particles->
x[
lowestDomainList->relevantDomainListIndices[relevantIndex]] < 0.5 * (min_x + max_x)) {
1178 max_x = 0.5 * (min_x + max_x);
1180 min_x = 0.5 * (min_x + max_x);
1183 if (particles->
y[
lowestDomainList->relevantDomainListIndices[relevantIndex]] < 0.5 * (min_y + max_y)) {
1185 max_y = 0.5 * (min_y + max_y);
1187 min_y = 0.5 * (min_y + max_y);
1190 if (particles->
z[
lowestDomainList->relevantDomainListIndices[relevantIndex]] < 0.5 * (min_z + max_z)) {
1192 max_z = 0.5 * (min_z + max_z);
1194 min_z = 0.5 * (min_z + max_z);
1201 if (particles->
x[bodyIndex + offset] < min_x) {
1203 dx = particles->
x[bodyIndex + offset] - min_x;
1204 }
else if (particles->
x[bodyIndex + offset] > max_x) {
1206 dx = particles->
x[bodyIndex + offset] - max_x;
1213 if (particles->
y[bodyIndex + offset] < min_y) {
1215 dy = particles->
y[bodyIndex + offset] - min_y;
1216 }
else if (particles->
y[bodyIndex + offset] > max_y) {
1218 dy = particles->
y[bodyIndex + offset] - max_y;
1225 if (particles->
z[bodyIndex + offset] < min_z) {
1227 dz = particles->
z[bodyIndex + offset] - min_z;
1228 }
else if (particles->
z[bodyIndex + offset] > max_z) {
1230 dz = particles->
z[bodyIndex + offset] - max_z;
1243 d = dx*dx + dy*dy + dz*dz;
1251 if (d < (searchRadius * searchRadius)) {
1255 sendIndices[bodyIndex + offset] = 1;
1276 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1277 integer stride = blockDim.x * gridDim.x;
1287 real dx, min_x, max_x;
1289 real dy, min_y, max_y;
1291 real dz, min_z, max_z;
1298 while ((bodyIndex + offset) < n) {
1302 for (
int relevantIndex = 0; relevantIndex < relevantIndicesCounter; relevantIndex++) {
1304 if (
lowestDomainList->relevantDomainListProcess[relevantIndex] != relevantProc) {
1312 currentParticleIndex = bodyIndex + offset;
1328 if (particles->
x[currentParticleIndex] < min_x) {
1330 dx = particles->
x[currentParticleIndex] - min_x;
1331 }
else if (particles->
x[currentParticleIndex] > max_x) {
1333 dx = particles->
x[currentParticleIndex] - max_x;
1340 if (particles->
y[currentParticleIndex] < min_y) {
1342 dy = particles->
y[currentParticleIndex] - min_y;
1343 }
else if (particles->
y[currentParticleIndex] > max_y) {
1345 dy = particles->
y[currentParticleIndex] - max_y;
1352 if (particles->
z[currentParticleIndex] < min_z) {
1354 dz = particles->
z[currentParticleIndex] - min_z;
1355 }
else if (particles->
z[currentParticleIndex] > max_z) {
1357 dz = particles->
z[currentParticleIndex] - max_z;
1370 d = dx*dx + dy*dy + dz*dz;
1381 if (d < (searchRadius * searchRadius)) {
1386 sendIndices[currentParticleIndex] = 1;
1406 int bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1407 int stride = blockDim.x * gridDim.x;
1410 int currentParticleIndex, particleLevel, domainListLevel, currentDomainListIndex, childIndex, currentProc;
1429 while ((bodyIndex + offset) < n) {
1433 for (
int relevantIndex = 0; relevantIndex < relevantIndicesCounter; relevantIndex++) {
1443 currentParticleIndex = bodyIndex + offset;
1473 if (particles->
x[currentParticleIndex] < min_x) {
1474 dx = particles->
x[currentParticleIndex] - min_x;
1475 }
else if (particles->
x[currentParticleIndex] > max_x) {
1476 dx = particles->
x[currentParticleIndex] - max_x;
1481 if (particles->
y[currentParticleIndex] < min_y) {
1482 dy = particles->
y[currentParticleIndex] - min_y;
1483 }
else if (particles->
y[currentParticleIndex] > max_y) {
1484 dy = particles->
y[currentParticleIndex] - max_y;
1489 if (particles->
z[currentParticleIndex] < min_z) {
1490 dz = particles->
z[currentParticleIndex] - min_z;
1491 }
else if (particles->
z[currentParticleIndex] > max_z) {
1492 dz = particles->
z[currentParticleIndex] - max_z;
1504 d = dx * dx + dy * dy + dz * dz;
1508 if (d < (searchRadius * searchRadius)) {
1512 added = added | (1 << currentProc);
1513 sendIndices[currentParticleIndex] = sendIndices[currentParticleIndex] | (1 << currentProc);
1529 integer bodyIndex = threadIdx.x + blockIdx.x*blockDim.x;
1530 integer stride = blockDim.x*gridDim.x;
1535 while ((bodyIndex + offset) < length) {
1537 if (sendIndices[bodyIndex + offset] == 1) {
1538 particleInsertIndex = atomicAdd(particlesCount, 1);
1555 integer bodyIndex = threadIdx.x + blockIdx.x*blockDim.x;
1556 integer stride = blockDim.x*gridDim.x;
1563 if ((sendIndices[bodyIndex + offset] >> currentProc) & 1) {
1565 particleInsertIndex = atomicAdd(particlesCount, 1);
1582 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1583 integer stride = blockDim.x * gridDim.x;
1592 real dx, min_x, max_x;
1594 real dy, min_y, max_y;
1596 real dz, min_z, max_z;
1605 if ((bodyIndex + offset) == 0) {
1606 printf(
"sphParticles2SendKernel: insertOffset = %i\n", insertOffset);
1611 for (
int i = 0; i < subDomainKeyTree->
numProcesses; i++) {
1612 alreadyInserted[i] = 0;
1618 min_x = *
tree->minX;
1619 max_x = *
tree->maxX;
1621 min_y = *
tree->minY;
1622 max_y = *
tree->maxY;
1624 min_z = *
tree->minZ;
1625 max_z = *
tree->maxZ;
1633 if (proc != subDomainKeyTree->
rank && alreadyInserted[proc] != 1) {
1673 if (particles->
x[
lowestDomainList->domainListIndices[i]] < 0.5 * (min_x + max_x)) {
1675 max_x = 0.5 * (min_x + max_x);
1677 min_x = 0.5 * (min_x + max_x);
1680 if (particles->
y[
lowestDomainList->domainListIndices[i]] < 0.5 * (min_y + max_y)) {
1682 max_y = 0.5 * (min_y + max_y);
1684 min_y = 0.5 * (min_y + max_y);
1687 if (particles->
z[
lowestDomainList->domainListIndices[i]] < 0.5 * (min_z + max_z)) {
1689 max_z = 0.5 * (min_z + max_z);
1691 min_z = 0.5 * (min_z + max_z);
1698 if (particles->
x[bodyIndex + offset] < min_x) {
1700 dx = particles->
x[bodyIndex + offset] - min_x;
1701 }
else if (particles->
x[bodyIndex + offset] > max_x) {
1703 dx = particles->
x[bodyIndex + offset] - max_x;
1710 if (particles->
y[bodyIndex + offset] < min_y) {
1712 dy = particles->
y[bodyIndex + offset] - min_y;
1713 }
else if (particles->
y[bodyIndex + offset] > max_y) {
1715 dy = particles->
y[bodyIndex + offset] - max_y;
1722 if (particles->
z[bodyIndex + offset] < min_z) {
1724 dz = particles->
z[bodyIndex + offset] - min_z;
1725 }
else if (particles->
z[bodyIndex + offset] > max_z) {
1727 dz = particles->
z[bodyIndex + offset] - max_z;
1740 d = dx * dx + dy * dy + dz * dz;
1743 if (d < (particles->
sml[bodyIndex + offset] * particles->
sml[bodyIndex + offset])) {
1748 insertIndex = atomicAdd(&sendCount[proc], 1);
1749 if (insertIndex > 100000) {
1750 printf(
"Attention!!! insertIndex: %i\n", insertIndex);
1752 insertIndexOffset = insertOffset * proc;
1753 toSend[insertIndexOffset + insertIndex] = bodyIndex + offset;
1762 alreadyInserted[proc] = 1;
1778 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1779 integer stride = blockDim.x * gridDim.x;
1782 while ((bodyIndex + offset) < count) {
1783 toSendCollected[bodyIndex + offset] = toSend[bodyIndex + offset];
1793 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1794 integer stride = blockDim.x * gridDim.x;
1802 if ((bodyIndex + offset) == 0) {
1803 printf(
"[rank %i] sendCount(%i, %i)\n", subDomainKeyTree->
rank, sendCount[0], sendCount[1]);
1808 while ((bodyIndex + offset) < totalSendCount) {
1809 toSend[bodyIndex + offset] = entry[sendIndices[bodyIndex + offset]];
1810 printf(
"toSend[%i] = %f sendIndices = %i (insertOffset = %i)\n", bodyIndex + offset, toSend[bodyIndex + offset],
1811 sendIndices[bodyIndex + offset], insertOffset);
1873#define COMPUTE_DIRECTLY 0
1879 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1880 integer stride = blockDim.x * gridDim.x;
1889 bool newBody =
true;
1908 offset =
tree->toDeleteLeaf[0];
1910 while ((bodyIndex + offset) <
tree->toDeleteLeaf[1]) {
1921 min_x = *
tree->minX;
1922 max_x = *
tree->maxX;
1923 x = particles->
x[bodyIndex + offset];
1925 y = particles->
y[bodyIndex + offset];
1926 min_y = *
tree->minY;
1927 max_y = *
tree->maxY;
1929 z = particles->
z[bodyIndex + offset];
1930 min_z = *
tree->minZ;
1931 max_z = *
tree->maxZ;
1939 if (x < 0.5 * (min_x + max_x)) {
1941 max_x = 0.5 * (min_x + max_x);
1944 min_x = 0.5 * (min_x + max_x);
1948 if (y < 0.5 * (min_y + max_y)) {
1950 max_y = 0.5 * (min_y + max_y);
1953 min_y = 0.5 * (min_y + max_y);
1957 if (z < 0.5 * (min_z + max_z)) {
1959 max_z = 0.5 * (min_z + max_z);
1962 min_z = 0.5 * (min_z + max_z);
1968 register integer childIndex = childList[temp*
POW_DIM + childPath];
1971 while (childIndex >= m) {
1984 if (x < 0.5 * (min_x + max_x)) {
1986 max_x = 0.5 * (min_x + max_x);
1989 min_x = 0.5 * (min_x + max_x);
1992 if (y < 0.5 * (min_y + max_y)) {
1994 max_y = 0.5 * (min_y + max_y);
1997 min_y = 0.5 * (min_y + max_y);
2000 if (z < 0.5 * (min_z + max_z)) {
2002 max_z = 0.5 * (min_z + max_z);
2005 min_z = 0.5 * (min_z + max_z);
2011 if (particles->
mass[bodyIndex + offset] != 0) {
2025 atomicAdd(&particles->
mass[temp], particles->
mass[bodyIndex + offset]);
2028 atomicAdd(&
tree->count[temp], 1);
2029 childIndex = childList[
POW_DIM * temp + childPath];
2035 if (childIndex != -2) {
2039 if (atomicCAS((
int *)&childList[locked], childIndex, -2) == childIndex) {
2042 if (childIndex == -1) {
2044 childList[locked] = bodyIndex + offset;
2045 particles->
level[bodyIndex + offset] = level + 1;
2049 if (childIndex >= n) {
2050 printf(
"ATTENTION!\n");
2053 while (childIndex >= 0 && childIndex < n) {
2057 patch =
min(patch, cell);
2059 if (patch != cell) {
2060 childList[
POW_DIM * temp + childPath] = cell;
2068 cudaAssert(
"buildTree: level = %i for index %i (%e)", level,
2069 bodyIndex + offset, particles->
x[bodyIndex + offset]);
2071 cudaAssert(
"buildTree: level = %i for index %i (%e, %e)", level,
2072 bodyIndex + offset, particles->
x[bodyIndex + offset],
2073 particles->
y[bodyIndex + offset]);
2075 cudaAssert(
"buildTree: level = %i for index %i (%e, %e, %e)", level,
2076 bodyIndex + offset, particles->
x[bodyIndex + offset],
2077 particles->
y[bodyIndex + offset],
2078 particles->
z[bodyIndex + offset]);
2084 if (particles->
x[childIndex] < 0.5 * (min_x + max_x)) { childPath += 1; }
2086 if (particles->
y[childIndex] < 0.5 * (min_y + max_y)) { childPath += 2; }
2088 if (particles->
z[childIndex] < 0.5 * (min_z + max_z)) { childPath += 4; }
2109 particles->
mass[cell] += particles->
mass[childIndex];
2112 particles->
x[cell] = 0.5 * (min_x + max_x);
2115 particles->
y[cell] = 0.5 * (min_y + max_y);
2118 particles->
z[cell] = 0.5 * (min_z + max_z);
2124 tree->count[cell] +=
tree->count[childIndex];
2126 childList[
POW_DIM * cell + childPath] = childIndex;
2127 particles->
level[cell] = level;
2128 particles->
level[childIndex] += 1;
2129 tree->start[cell] = -1;
2132 if (particles->
level[cell] >= particles->
level[childIndex]) {
2133 printf(
"lvl: %i vs. %i\n", particles->
level[cell], particles->
level[childIndex]);
2144 if (x < 0.5 * (min_x + max_x)) {
2146 max_x = 0.5 * (min_x + max_x);
2148 min_x = 0.5 * (min_x + max_x);
2152 if (y < 0.5 * (min_y + max_y)) {
2154 max_y = 0.5 * (min_y + max_y);
2156 min_y = 0.5 * (min_y + max_y);
2160 if (z < 0.5 * (min_z + max_z)) {
2162 max_z = 0.5 * (min_z + max_z);
2164 min_z = 0.5 * (min_z + max_z);
2170 if (particles->
mass[bodyIndex + offset] != 0) {
2181 particles->
mass[cell] += particles->
mass[bodyIndex + offset];
2184 tree->count[cell] +=
tree->count[bodyIndex + offset];
2185 childIndex = childList[
POW_DIM * temp + childPath];
2188 childList[
POW_DIM * temp + childPath] = bodyIndex + offset;
2189 particles->
level[bodyIndex + offset] = level + 1;
2192 childList[locked] = patch;
2553 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
2554 integer stride = blockDim.x * gridDim.x;
2565 while ((bodyIndex + offset) <
tree->toDeleteNode[1]) {
2567 if (particles->
level[bodyIndex + offset] == level) {
2569 if (particles->
level[bodyIndex + offset] == -1 || particles->
level[bodyIndex + offset] > 21) {
2570 printf(
"level[%i] = %i!!!\n", bodyIndex + offset, particles->
level[bodyIndex + offset]);
2573 particles->
mass[bodyIndex + offset] = 0.;
2574 particles->
x[bodyIndex + offset] = 0.;
2576 particles->
y[bodyIndex + offset] = 0.;
2578 particles->
z[bodyIndex + offset] = 0.;
2582 for (
int child = 0; child <
POW_DIM; ++child) {
2583 index =
POW_DIM * (bodyIndex + offset) + child;
2584 if (
tree->child[index] != -1) {
2592 particles->
mass[bodyIndex + offset] += particles->
mass[
tree->child[index]];
2596 if (particles->
mass[bodyIndex + offset] > 0.) {
2597 particles->
x[bodyIndex + offset] /= particles->
mass[bodyIndex + offset];
2599 particles->
y[bodyIndex + offset] /= particles->
mass[bodyIndex + offset];
2601 particles->
z[bodyIndex + offset] /= particles->
mass[bodyIndex + offset];
2623 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
2624 integer stride = blockDim.x * gridDim.x;
2627 int lowestDomainIndex;
2634 real min_x, max_x, dx;
2636 real min_y, max_y, dy;
2638 real min_z, max_z, dz;
2642 while ((bodyIndex + offset) < n) {
2653 if (proc != subDomainKeyTree->
rank) {
2655 min_x = *
tree->minX;
2656 max_x = *
tree->maxX;
2658 min_y = *
tree->minY;
2659 max_y = *
tree->maxY;
2661 min_z = *
tree->minZ;
2662 max_z = *
tree->maxZ;
2671 max_x = 0.5 * (min_x + max_x);
2674 min_x = 0.5 * (min_x + max_x);
2677 if ((path >> 1) & 1) {
2679 max_y = 0.5 * (min_y + max_y);
2682 min_y = 0.5 * (min_y + max_y);
2685 if ((path >> 2) & 1) {
2687 max_z = 0.5 * (min_z + max_z);
2690 min_z = 0.5 * (min_z + max_z);
2725 if (particles->
x[bodyIndex + offset] < min_x) {
2727 dx = particles->
x[bodyIndex + offset] - min_x;
2728 }
else if (particles->
x[bodyIndex + offset] > max_x) {
2730 dx = particles->
x[bodyIndex + offset] - max_x;
2737 if (particles->
y[bodyIndex + offset] < min_y) {
2739 dy = particles->
y[bodyIndex + offset] - min_y;
2740 }
else if (particles->
y[bodyIndex + offset] > max_y) {
2742 dy = particles->
y[bodyIndex + offset] - max_y;
2749 if (particles->
z[bodyIndex + offset] < min_z) {
2751 dz = particles->
z[bodyIndex + offset] - min_z;
2752 }
else if (particles->
z[bodyIndex + offset] > max_z) {
2754 dz = particles->
z[bodyIndex + offset] - max_z;
2769 if (distance < particles->sml[bodyIndex + offset]) {
2770 searchRadius = particles->
sml[bodyIndex + offset] - distance;
2776 searchRadii[bodyIndex + offset] = searchRadius;
3065 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
3066 integer stride = blockDim.x * gridDim.x;
3071 if ((bodyIndex + offset) % 1000 == 0) {
3073 printf(
"particles->noi[%i] = %i\n", bodyIndex + offset, particles->
noi[bodyIndex + offset]);
3107 Logger(
INFO) <<
"calling new fixed radius...";
3150 lowestDomainList, sendIndices, searchRadius, n, m, relevantIndex, curveType);
3159 lowestDomainList, sendIndices, searchRadius, n, m, relevantProc, relevantIndicesCounter,
3160 relevantIndexOld, curveType);
3169 domainList, sendIndices, searchRadius, n, m, relevantProc, relevantIndicesCounter, curveType);
3177 sendIndices,
particles2Send, particlesCount, n, length, curveType);
3187 treeIndex, currentProc, curveType);
3213 entry, toSend, sendIndices, sendCount, totalSendCount, insertOffset);
integer * relevantDomainListOriginalIndex
integer * relevantDomainListIndices
concentrate domain list nodes, usable to reduce domain list indices in respect to some criterion
integer * relevantDomainListProcess
Execution policy/instruction for CUDA kernel execution.
Particle(s) class based on SoA (Structur of Arrays).
integer * noi
(pointer to) number of interactions (array)
CUDA_CALLABLE_MEMBER real weightedEntry(integer index, Entry::Name entry)
integer * materialId
(pointer to) material identifier (array)
real * x
(pointer to) x position (array)
integer * level
(pointer to) level of the (pseudo-)particles
idInteger * uid
(pointer to) unique identifier (array)
integer * nodeType
(pointer to) node type
real * y
(pointer to) y position (array)
real * sml
(pointer to) smoothing length (array)
real * mass
(pointer to) mass (array)
real * z
(pointer to) z position (array)
SubDomainKeyTree class handling rank, number of processes and ranges.
CUDA_CALLABLE_MEMBER integer key2proc(keyType key)
Compute particle's MPI process belonging by it's key.
integer numProcesses
MPI number of processes.
#define cudaTerminate(...)
__global__ void lowestDomainList(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, DomainList *lowestDomainList, integer n, integer m)
Kernel to create the lowest domain list.
__global__ void collectSendIndices(Tree *tree, Particles *particles, integer *sendIndices, integer *particles2Send, integer *pseudoParticles2Send, integer *pseudoParticlesLevel, integer *particlesCount, integer *pseudoParticlesCount, integer n, integer length, Curve::Type curveType)
Collect the send indices.
CUDA_CALLABLE_MEMBER integer key2proc(keyType key, SubDomainKeyTree *subDomainKeyTree)
Convert the key to the corresponding process.
CUDA_CALLABLE_MEMBER keyType lebesgue2hilbert(keyType lebesgue, integer maxLevel)
Convert a Lebesgue key to a Hilbert key.
__global__ void info(Material *material)
Debug kernel giving information about material(s).
const char *const compTheta
const char *const symbolicForce
const char *const insertReceivedParticles
const char *const fixedRadiusNN
const char *const determineSearchRadii
const char *const numParticlesLocal
const char *const numParticles
real symbolicForce_test2(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, integer *sendIndices, real searchRadius, integer n, integer m, integer relevantProc, integer relevantIndicesCounter, Curve::Type curveType)
real symbolicForce_test(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *lowestDomainList, integer *sendIndices, real searchRadius, integer n, integer m, integer relevantProc, integer relevantIndicesCounter, integer relevantIndexOld, Curve::Type curveType)
real fixedRadiusNN_sharedMemory(Tree *tree, Particles *particles, integer *interactions, integer numParticlesLocal, integer numParticles, integer numNodes)
Wrapper for SPH::Kernel::fixedRadiusNN_sharedMemory().
real compTheta(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *lowestDomainList, Curve::Type curveType)
Wrapper for SPH::Kernel::compTheta().
real collectSendEntriesBackup(SubDomainKeyTree *subDomainKeyTree, real *entry, real *toSend, integer *sendIndices, integer *sendCount, integer totalSendCount, integer insertOffset)
Wrapper for SPH::Kernel::collectSendEntriesBackup().
real insertReceivedParticles(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, DomainList *lowestDomainList, int n, int m)
Wrapper for SPH::Kernel::insertReceivedParticles().
real collectSendIndices_test2(Tree *tree, Particles *particles, integer *sendIndices, integer *particles2Send, integer *particlesCount, integer numParticlesLocal, integer numParticles, integer treeIndex, int currentProc, Curve::Type curveType)
real collectSendIndices(Tree *tree, Particles *particles, integer *sendIndices, integer *particles2Send, integer *particlesCount, integer n, integer length, Curve::Type curveType)
Wrapper for SPH::Kernel::collectSendIndices().
real fixedRadiusNN_bruteForce(Tree *tree, Particles *particles, integer *interactions, integer numParticlesLocal, integer numParticles, integer numNodes)
Wrapper for SPH::Kernel::fixedRadiusNN_bruteForce().
real collectSendIndicesBackup(integer *toSend, integer *toSendCollected, integer count)
Wrapper for SPH::Kernel::collectSendIndicesBackup().
real fixedRadiusNN(Tree *tree, Particles *particles, integer *interactions, real radius, integer numParticlesLocal, integer numParticles, integer numNodes)
Wrapper for SPH::Kernel::fixedRadiusNN().
real fixedRadiusNN_withinBox(Tree *tree, Particles *particles, integer *interactions, integer numParticlesLocal, integer numParticles, integer numNodes)
Wrapper for SPH::Kernel::fixedRadiusNN_withinBox().
real particles2Send(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, DomainList *lowestDomainList, integer maxLevel, integer *toSend, integer *sendCount, integer *alreadyInserted, integer insertOffset, integer numParticlesLocal, integer numParticles, integer numNodes, real radius, Curve::Type curveType=Curve::lebesgue)
Wrapper for SPH::Kernel::particles2Send().
real symbolicForce(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *lowestDomainList, integer *sendIndices, real searchRadius, integer n, integer m, integer relevantIndex, Curve::Type curveType)
Wrapper for SPH::Kernel::symbolicForce().
real info(Tree *tree, Particles *particles, Helper *helper, integer numParticlesLocal, integer numParticles, integer numNodes)
Wrapper for SPH::Kernel::info().
real determineSearchRadii(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, DomainList *lowestDomainList, real *searchRadii, int n, int m, Curve::Type curveType)
Wrapper for SPH::Kernel::determineSearchRadii().
real fixedRadiusNN_variableSML(Material *materials, Tree *tree, Particles *particles, integer *interactions, integer numParticlesLocal, integer numParticles, integer numNodes)
Wrapper for SPH::Kernel::fixedRadiusNN_variableSML().
real calculateCentersOfMass(Tree *tree, Particles *particles, integer level)
Wrapper for SPH::Kernel::calculateCentersOfMass().
__device__ void redoNeighborSearch(Tree *tree, Particles *particles, int particleId, int *interactions, real radius, integer numParticles, integer numNodes)
Redo the neighbor search (FRNN).
__global__ void compTheta(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *lowestDomainList, Curve::Type curveType)
Find the relevant (lowest) domain list nodes as preparation for finding particles to be exchanged bet...
__global__ void calculateCentersOfMass(Tree *tree, Particles *particles, integer level)
__global__ void symbolicForce(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *lowestDomainList, integer *sendIndices, real searchRadius, integer n, integer m, integer relevantIndex, Curve::Type curveType)
Find the particles that need to be exchanged between processes to grant correctness of SPH forces.
__global__ void symbolicForce_test2(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, integer *sendIndices, real searchRadius, integer n, integer m, integer relevantProc, integer relevantIndicesCounter, Curve::Type curveType)
__global__ void collectSendIndices_test2(Tree *tree, Particles *particles, integer *sendIndices, integer *particles2Send, integer *particlesCount, integer numParticlesLocal, integer numParticles, integer treeIndex, int currentProc, Curve::Type curveType)
__global__ void insertReceivedParticles(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, DomainList *lowestDomainList, int n, int m)
Insert the received particles into the local tree.
__global__ void collectSendIndicesBackup(integer *toSend, integer *toSendCollected, integer count)
__global__ void fixedRadiusNN_variableSML(Material *materials, Tree *tree, Particles *particles, integer *interactions, integer numParticlesLocal, integer numParticles, integer numNodes)
Fixed-radius near neighbor search for iteratively finding appropriate smoothing length.
__global__ void particles2Send(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, DomainList *lowestDomainList, integer maxLevel, integer *toSend, integer *sendCount, integer *alreadyInserted, integer insertOffset, integer numParticlesLocal, integer numParticles, integer numNodes, real radius, Curve::Type curveType=Curve::lebesgue)
__global__ void info(Tree *tree, Particles *particles, Helper *helper, integer numParticlesLocal, integer numParticles, integer numNodes)
Info/Debug kernel.
__global__ void collectSendIndices(Tree *tree, Particles *particles, integer *sendIndices, integer *particles2Send, integer *particlesCount, integer n, integer length, Curve::Type curveType)
Collect the found particles into contiguous memory in order to facilitate sending via MPI.
__global__ void fixedRadiusNN(Tree *tree, Particles *particles, integer *interactions, real radius, integer numParticlesLocal, integer numParticles, integer numNodes)
Fixed-radius near neighbor search (default method via explicit stack).
__global__ void fixedRadiusNN_withinBox(Tree *tree, Particles *particles, integer *interactions, integer numParticlesLocal, integer numParticles, integer numNodes)
Fixed-radius near neighbor search (nested stack method).
__global__ void determineSearchRadii(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, DomainList *lowestDomainList, real *searchRadii, int n, int m, Curve::Type curveType)
Determine the search radius needed for SPH::Kernel::symbolicForce().
__global__ void symbolicForce_test(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *lowestDomainList, integer *sendIndices, real searchRadius, integer n, integer m, integer relevantProc, integer relevantIndicesCounter, integer relevantIndexOld, Curve::Type curveType)
__global__ void collectSendEntriesBackup(SubDomainKeyTree *subDomainKeyTree, real *entry, real *toSend, integer *sendIndices, integer *sendCount, integer totalSendCount, integer insertOffset)
__global__ void fixedRadiusNN_sharedMemory(Tree *tree, Particles *particles, integer *interactions, integer numParticlesLocal, integer numParticles, integer numNodes)
Fixed-radius near neighbor search (brute-force method).
__global__ void fixedRadiusNN_bruteForce(Tree *tree, Particles *particles, integer *interactions, integer numParticlesLocal, integer numParticles, integer numNodes)
Fixed-radius near neighbor search (brute-force method).
SPH related functions and kernels.
void exchangeParticleEntry(SubDomainKeyTree *subDomainKeyTree, real *entry, real *toSend, integer *sendLengths, integer *receiveLengths, integer numParticlesLocal)
__device__ real min(real a, real b)
Minimum value out of two floating point values.
__device__ real sqrt(real a)
Square root of a floating point value.
__device__ real abs(real a)
Absolute value of a floating point value.
__device__ real max(real a, real b)
Maximum value out of two floating point values.
real launch(bool timeKernel, const ExecutionPolicy &policy, void(*f)(Arguments...), Arguments... args)
CUDA execution wrapper function.
#define MAX_NUM_INTERACTIONS
#define DIM
Dimension of the problem.
#define MAX_VARIABLE_SML_ITERATIONS
#define TOLERANCE_WANTED_NUMBER_OF_INTERACTIONS