1#include "../../include/subdomain_key_tree/subdomain.cuh"
2#include "../../include/cuda_utils/cuda_launcher.cuh"
7 for (
int i=0; i<maxLevel; i++) {
8 level[i] = (int)(key >> (maxLevel*
DIM -
DIM*(i+1)) & (int)(
POW_DIM - 1));
10 for (
int i=0; i<=maxLevel; i++) {
11 keyAsChar[2*i] = level[i] +
'0';
12 keyAsChar[2*i+1] =
'|';
14 keyAsChar[2*maxLevel+3] =
'\0';
18 return subDomainKeyTree->
key2proc(key);
26 integer *procParticleCounter) : rank(rank),
27 numProcesses(numProcesses), range(range),
28 procParticleCounter(procParticleCounter) {
71 if (key >=
range[proc] && key <
range[proc + 1]) {
75 printf(
"ERROR: key2proc(k=%lu): -1!\n", key);
88 p2 =
key2proc(key | ~(~0UL <<
DIM * (maxLevel - level)));
101 keyType keyMax = (shiftValue << toShift);
105 keyType keyMax = (shiftValue << toShift);
113 p2 =
key2proc(hilbert | (keyMax >> (
DIM*level+1)));
124 printf(
"Curve type not available!\n");
138 integer *procParticleCounter) {
139 subDomainKeyTree->
set(rank, numProcesses, range, procParticleCounter);
143 printf(
"device: subDomainKeyTree->rank = %i\n", subDomainKeyTree->
rank);
144 printf(
"device: subDomainKeyTree->numProcesses = %i\n", subDomainKeyTree->
numProcesses);
176 childIndex =
tree->child[
POW_DIM*childIndex+path[j]];
177 if (childIndex < n) {
178 if (childIndex == -1) {
182 particles->
level[cell] = j + 1;
185 particles->
nodeType[childIndex] = 1;
188 printf(
"adding node index %i cell = %i (childPath = %i, j = %i)! x = (%e, %e, %e) level = %i\n",
189 temp, cell, path[j], j, particles->
x[childIndex], particles->
y[childIndex], particles->
z[childIndex],
190 particles->
level[cell]);
210 for (
int k=0; k<=j; k++) {
212 currentChild = path[k];
214 if (currentChild % 2 != 0) {
215 max_x = 0.5 * (min_x + max_x);
219 min_x = 0.5 * (min_x + max_x);
222 if (currentChild % 2 == 0 && currentChild % 4 != 0) {
223 max_y = 0.5 * (min_y + max_y);
227 min_y = 0.5 * (min_y + max_y);
230 if (currentChild == 4) {
231 max_z = 0.5 * (min_z + max_z);
235 min_z = 0.5 * (min_z + max_z);
242 if (particles->
x[childIndex] < 0.5 * (min_x + max_x)) {
248 if (particles->
y[childIndex] < 0.5 * (min_y + max_y)) {
254 if (particles->
z[childIndex] < 0.5 * (min_z + max_z)) {
263 particles->
x[cell] = particles->
x[childIndex];
266 particles->
y[cell] = particles->
y[childIndex];
269 particles->
z[cell] = particles->
z[childIndex];
273 particles->
mass[cell] = particles->
mass[childIndex];
275 particles->
level[cell] = particles->
level[childIndex];
276 particles->
level[childIndex] += 1;
279 printf(
"adding node in between for index %i, temp = %i cell = %i (childPath = %i, j = %i)! x = (%e, %e, %e) level = %i vs %i\n",
280 childIndex, temp, cell, childPath, j, particles->
x[childIndex], particles->
y[childIndex], particles->
z[childIndex],
281 particles->
level[cell], particles->
level[childIndex]);
285 tree->child[
POW_DIM * cell + childPath] = childIndex;
289 particles->
nodeType[childIndex] = 1;
296 if (particles->
nodeType[childIndex] >= 1) {
308 particles->
nodeType[childIndex] = 1;
612 integer index = threadIdx.x + blockIdx.x * blockDim.x;
613 integer stride = blockDim.x * gridDim.x;
664 childIndex =
tree->child[
POW_DIM * childIndex + path[j]];
668 for (k = 0; k < j; k++) {
670 currentChild = path[k];
671 if (currentChild & 1) {
672 max_x = 0.5 * (min_x + max_x);
674 min_x = 0.5 * (min_x + max_x);
677 if ((currentChild >> 1) & 1) {
678 max_y = 0.5 * (min_y + max_y);
681 min_y = 0.5 * (min_y + max_y);
684 if ((currentChild >> 2) & 1) {
685 max_z = 0.5 * (min_z + max_z);
687 min_z = 0.5 * (min_z + max_z);
693 if (childIndex < n) {
694 if (childIndex == -1) {
699 particles->
level[cell] = j;
703 particles->
nodeType[childIndex] = 1;
705 domainList->
borders[(index + offset) * 2 *
DIM] = min_x;
706 domainList->
borders[(index + offset) * 2 *
DIM + 1] = max_x;
708 domainList->
borders[(index + offset) * 2 *
DIM + 2] = min_y;
709 domainList->
borders[(index + offset) * 2 *
DIM + 3] = max_y;
711 domainList->
borders[(index + offset) * 2 *
DIM + 4] = min_z;
712 domainList->
borders[(index + offset) * 2 *
DIM + 5] = max_z;
717 particles->
x[cell] = 0.5 * (min_x + max_x);
720 particles->
y[cell] = 0.5 * (min_y + max_y);
723 particles->
z[cell] = 0.5 * (min_z + max_z);
729 printf(
"adding node index %i cell = %i (childPath = %i, j = %i)! x = (%e, %e, %e) level = %i\n",
730 temp, cell, path[j], j, particles->
x[childIndex], particles->
y[childIndex], particles->
z[childIndex],
731 particles->
level[cell]);
736 printf(
"[rank %i] adding domainListIndices[%i] = %i, childIndex = %i, path = %i\n", subDomainKeyTree->
rank,
737 index + offset, domainList->
domainListIndices[index + offset], childIndex, path[j-1]);
750 particles->
x[cell] = particles->
x[childIndex];
753 particles->
y[cell] = particles->
y[childIndex];
756 particles->
z[cell] = particles->
z[childIndex];
761 domainList->
borders[(index + offset) * 2 *
DIM] = min_x;
762 domainList->
borders[(index + offset) * 2 *
DIM + 1] = max_x;
764 domainList->
borders[(index + offset) * 2 *
DIM + 2] = min_y;
765 domainList->
borders[(index + offset) * 2 *
DIM + 3] = max_y;
767 domainList->
borders[(index + offset) * 2 *
DIM + 4] = min_z;
768 domainList->
borders[(index + offset) * 2 *
DIM + 5] = max_z;
772 particles->
level[cell] = particles->
level[childIndex];
774 atomicAdd(&particles->
level[childIndex], 1);
778 if (particles->
x[childIndex] < 0.5 * (min_x + max_x)) {
784 if (particles->
y[childIndex] < 0.5 * (min_y + max_y)) {
790 if (particles->
z[childIndex] < 0.5 * (min_z + max_z)) {
802 printf(
"adding node in between for index %i, temp = %i cell = %i (childPath = %i, j = %i)! x = (%e, %e, %e) level = %i vs %i\n",
803 childIndex, temp, cell, childPath, j, particles->
x[childIndex], particles->
y[childIndex], particles->
z[childIndex],
804 particles->
level[cell], particles->
level[childIndex]);
809 tree->child[
POW_DIM * cell + childPath] = childIndex;
817 particles->
nodeType[childIndex] = 1;
823 printf(
"[rank %i] Mark already available node %i: %i, path = %i (level = %i)\n",
824 subDomainKeyTree->
rank, index + offset, childIndex, path[j-1], level);
827 domainList->
borders[(index + offset) * 2 *
DIM] = min_x;
828 domainList->
borders[(index + offset) * 2 *
DIM + 1] = max_x;
830 domainList->
borders[(index + offset) * 2 *
DIM + 2] = min_y;
831 domainList->
borders[(index + offset) * 2 *
DIM + 3] = max_y;
833 domainList->
borders[(index + offset) * 2 *
DIM + 4] = min_z;
834 domainList->
borders[(index + offset) * 2 *
DIM + 5] = max_z;
838 particles->
nodeType[childIndex] = 1;
853 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
854 integer stride = blockDim.x * gridDim.x;
860 char keyAsChar[21 * 2 + 3];
863 while (bodyIndex + offset < n) {
866 particleKey =
tree->getParticleKey(particles, bodyIndex + offset, maxLevel, curveType);
886 keys[bodyIndex + offset] = particleKey;
900 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
901 integer stride = blockDim.x * gridDim.x;
907 while ((bodyIndex + offset) < n) {
910 key =
tree->getParticleKey(particles, bodyIndex + offset,
MAX_LEVEL, curveType);
912 proc = subDomainKeyTree->
key2proc(key);
924 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
925 integer stride = blockDim.x * gridDim.x;
932 while ((bodyIndex + offset) < n) {
935 key =
tree->getParticleKey(particles, bodyIndex + offset,
MAX_LEVEL, curveType);
937 proc = subDomainKeyTree->
key2proc(key);
949 integer bodyIndex = threadIdx.x + blockIdx.x*blockDim.x;
950 integer stride = blockDim.x*gridDim.x;
965 if (particles->
nodeType[domainIndex] == 2) {
970 particles->
x[domainIndex] = (
real)0;
972 particles->
y[domainIndex] = (
real)0;
974 particles->
z[domainIndex] = (
real)0;
977 particles->
mass[domainIndex] = (
real)0;
981 particles->
x[domainIndex] *= particles->
mass[domainIndex];
983 particles->
y[domainIndex] *= particles->
mass[domainIndex];
985 particles->
z[domainIndex] *= particles->
mass[domainIndex];
994 template <
typename T>
998 integer bodyIndex = threadIdx.x + blockIdx.x*blockDim.x;
999 integer stride = blockDim.x*gridDim.x;
1006 lowestDomainIndex =
lowestDomainList->domainListIndices[bodyIndex + offset];
1007 if (lowestDomainIndex >= 0) {
1010 buffer[bodyIndex + offset] = particles->
x[lowestDomainIndex];
1014 buffer[bodyIndex + offset] = particles->
y[lowestDomainIndex];
1018 buffer[bodyIndex + offset] = particles->
z[lowestDomainIndex];
1023 buffer[bodyIndex + offset] = particles->
mass[lowestDomainIndex];
1026 printf(
"prepareLowestDomainExchange(): Not available!\n");
1033 template <
typename T>
1037 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1038 integer stride = blockDim.x * gridDim.x;
1052 if (originalIndex == -1) {
1053 cudaTerminate(
"ATTENTION: originalIndex = -1 (index = %i)!\n",
1060 buffer[bodyIndex + offset];
1065 buffer[bodyIndex + offset];
1070 buffer[bodyIndex + offset];
1076 buffer[bodyIndex + offset];
1079 printf(
"Entry not available!\n");
1089 integer bodyIndex = threadIdx.x + blockIdx.x*blockDim.x;
1090 integer stride = blockDim.x*gridDim.x;
1098 lowestDomainIndex =
lowestDomainList->domainListIndices[bodyIndex + offset];
1111 if (particles->
mass[lowestDomainIndex] > (
real)0) {
1118 particles->
x[lowestDomainIndex] /= particles->
mass[lowestDomainIndex];
1120 particles->
y[lowestDomainIndex] /= particles->
mass[lowestDomainIndex];
1122 particles->
z[lowestDomainIndex] /= particles->
mass[lowestDomainIndex];
1141 integer bodyIndex = threadIdx.x + blockIdx.x*blockDim.x;
1142 integer stride = blockDim.x*gridDim.x;
1148 while (bodyIndex + offset < *tree->index) {
1149 isDomainList =
false;
1157 if (particles->
nodeType[bodyIndex + offset] >= 1) {
1158 isDomainList =
true;
1161 if (!isDomainList) {
1162 particles->
x[bodyIndex + offset] /= particles->
mass[bodyIndex + offset];
1164 particles->
y[bodyIndex + offset] /= particles->
mass[bodyIndex + offset];
1166 particles->
z[bodyIndex + offset] /= particles->
mass[bodyIndex + offset];
1178 integer bodyIndex = threadIdx.x + blockIdx.x*blockDim.x;
1179 integer stride = blockDim.x*gridDim.x;
1186 real totalMass, x_mass, y_mass, z_mass;
1187 int childToLookAt = 5;
1188 int lowestDomainIndex, childLowestDomain;
1195 if (particles->
nodeType[domainIndex] == 2) {
1250 if (compute && domainList->
domainListLevels[bodyIndex + offset] == level) {
1262 for (
int i=0; i<
POW_DIM; i++) {
1263 particles->
x[domainIndex] += particles->
x[
tree->child[
POW_DIM*domainIndex + i]] *
1266 particles->
y[domainIndex] += particles->
y[
tree->child[
POW_DIM*domainIndex + i]] *
1269 particles->
z[domainIndex] += particles->
z[
tree->child[
POW_DIM*domainIndex + i]] *
1276 if (particles->
mass[domainIndex] > 0.) {
1277 particles->
x[domainIndex] /= particles->
mass[domainIndex];
1279 particles->
y[domainIndex] /= particles->
mass[domainIndex];
1281 particles->
z[domainIndex] /= particles->
mass[domainIndex];
1295 integer bodyIndex = threadIdx.x + blockIdx.x*blockDim.x;
1296 integer stride = blockDim.x*gridDim.x;
1304 while (level >= 0) {
1310 if (particles->
nodeType[domainIndex] == 2) {
1318 if (compute && domainList->
domainListLevels[bodyIndex + offset] == level) {
1320 for (
int i=0; i<
POW_DIM; i++) {
1321 particles->
x[domainIndex] += particles->
x[
tree->child[
POW_DIM*domainIndex + i]] *
1324 particles->
y[domainIndex] += particles->
y[
tree->child[
POW_DIM*domainIndex + i]] *
1327 particles->
z[domainIndex] += particles->
z[
tree->child[
POW_DIM*domainIndex + i]] *
1334 if (particles->
mass[domainIndex] != 0.f) {
1335 particles->
x[domainIndex] /= particles->
mass[domainIndex];
1337 particles->
y[domainIndex] /= particles->
mass[domainIndex];
1339 particles->
z[domainIndex] /= particles->
mass[domainIndex];
1359 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1360 integer stride = blockDim.x * gridDim.x;
1367 if (bodyIndex + offset == 0) {
1368 *
tree->index =
tree->toDeleteNode[0];
1373 for (
int i=0; i<
POW_DIM; i++) {
1375 || (
tree->child[
POW_DIM * domainIndex + i] >=
tree->toDeleteLeaf[0] &&
1400 offset =
tree->toDeleteLeaf[0];
1402 while ((bodyIndex + offset) >=
tree->toDeleteLeaf[0] && (bodyIndex + offset) <
tree->toDeleteLeaf[1]) {
1403 for (
int i=0; i<
POW_DIM; i++) {
1404 tree->child[(bodyIndex + offset)*
POW_DIM + i] = -1;
1406 tree->count[bodyIndex + offset] = 1;
1408 particles->
x[bodyIndex + offset] = 0.;
1409 particles->
vx[bodyIndex + offset] = 0.;
1410 particles->
ax[bodyIndex + offset] = 0.;
1411 particles->
g_ax[bodyIndex + offset] = 0.;
1413 particles->
y[bodyIndex + offset] = 0.;
1414 particles->
vy[bodyIndex + offset] = 0.;
1415 particles->
ay[bodyIndex + offset] = 0.;
1416 particles->
g_ay[bodyIndex + offset] = 0.;
1418 particles->
z[bodyIndex + offset] = 0.;
1419 particles->
vz[bodyIndex + offset] = 0.;
1420 particles->
az[bodyIndex + offset] = 0.;
1421 particles->
g_az[bodyIndex + offset] = 0.;
1424 particles->
mass[bodyIndex + offset] = 0.;
1425 tree->start[bodyIndex + offset] = -1;
1426 tree->sorted[bodyIndex + offset] = 0;
1431 offset =
tree->toDeleteNode[0];
1433 while ((bodyIndex + offset) >=
tree->toDeleteNode[0] && (bodyIndex + offset) <
tree->toDeleteNode[1]) {
1434 for (
int i=0; i<
POW_DIM; i++) {
1435 tree->child[(bodyIndex + offset)*
POW_DIM + i] = -1;
1437 tree->count[bodyIndex + offset] = 0;
1438 particles->
x[bodyIndex + offset] = 0.;
1442 particles->
y[bodyIndex + offset] = 0.;
1446 particles->
z[bodyIndex + offset] = 0.;
1451 particles->
mass[bodyIndex + offset] = 0.;
1461 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1462 integer stride = blockDim.x * gridDim.x;
1467 while ((bodyIndex + offset) < bins) {
1469 helper->
keyTypeBuffer[bodyIndex + offset] = (bodyIndex + offset) * (max_key/bins);
1472 if ((bodyIndex + offset) == (bins - 1)) {
1483 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1484 integer stride = blockDim.x * gridDim.x;
1489 while ((bodyIndex + offset) < n) {
1491 key =
tree->getParticleKey(particles, bodyIndex + offset,
MAX_LEVEL, curveType);
1493 for (
int i = 0; i < (bins); i++) {
1494 if (key >= helper->
keyTypeBuffer[i] && key < helper->keyTypeBuffer[i + 1]) {
1511 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
1512 integer stride = blockDim.x * gridDim.x;
1518 while ((bodyIndex + offset) < (bins-1)) {
1521 for (
integer i=0; i<(bodyIndex+offset); i++) {
1526 if ((sum + helper->
integerBuffer[bodyIndex + offset]) >= (i*n) && sum < (i*n)) {
1539 integer *procParticleCounter) {
1542 numProcesses, range, procParticleCounter);
1560 subDomainKeyTree,
tree, particles,
1561 domainList, n, m, level);
1569 tree, particles, keys, maxLevel, n, curveType);
1576 subDomainKeyTree,
tree, particles, n, m, curveType);
1584 subDomainKeyTree,
tree, particles, n, m,
sortArray, curveType);
1593 template <
typename T>
1604 template <
typename T>
1659 subDomainKeyTree, helper, bins, n, curveType);
1666 bins, n, curveType);
1681 integer *relevantDomainListIndices,
integer *relevantDomainListLevels,
1682 integer *relevantDomainListProcess) :
1683 domainListIndices(domainListIndices), domainListLevels(domainListLevels),
1684 domainListIndex(domainListIndex), domainListCounter(domainListCounter),
1685 domainListKeys(domainListKeys), sortedDomainListKeys(sortedDomainListKeys),
1686 relevantDomainListIndices(relevantDomainListIndices),
1687 relevantDomainListLevels(relevantDomainListLevels),
1688 relevantDomainListProcess(relevantDomainListProcess) {
1698 keyType *sortedDomainListKeys,
integer *relevantDomainListIndices,
1699 integer *relevantDomainListLevels,
integer *relevantDomainListProcess) {
1725 keyType *sortedDomainListKeys,
integer *relevantDomainListIndices,
1726 integer *relevantDomainListLevels,
integer *relevantDomainListProcess) {
1728 domainList->
set(domainListIndices, domainListLevels, domainListIndex, domainListCounter, domainListKeys,
1729 sortedDomainListKeys, relevantDomainListIndices, relevantDomainListLevels,
1730 relevantDomainListProcess);
1734 domainList->
setBorders(borders, relevantDomainListOriginalIndex);
1739 integer index = threadIdx.x + blockIdx.x * blockDim.x;
1740 integer stride = blockDim.x * gridDim.x;
1759 printf(
"domainListIndices[%i] = %i, level = %i, x = (%f, %f, %f) mass = %f\n", index + offset,
1760 domainListIndex, domainList->
domainListLevels[index + offset], particles->
x[domainListIndex],
1761 particles->
y[domainListIndex], particles->
z[domainListIndex],
1762 particles->
mass[domainListIndex]);
1779 integer index = threadIdx.x + blockIdx.x * blockDim.x;
1780 integer stride = blockDim.x * gridDim.x;
1816 printf(
"domainListIndices[%i] = %i, x = (%f) mass = %f\n", index + offset,
1817 domainListIndex, particles->
x[domainListIndex], particles->
mass[domainListIndex]);
1819 printf(
"domainListIndices[%i] = %i, x = (%f, %f) mass = %f\n", index + offset,
1820 domainListIndex, particles->
x[domainListIndex],
1821 particles->
y[domainListIndex], particles->
mass[domainListIndex]);
1823 printf(
"domainListIndices[%i] = %i, x = (%f, %f, %f) mass = %f\n", index + offset,
1824 domainListIndex, particles->
x[domainListIndex],
1825 particles->
y[domainListIndex], particles->
z[domainListIndex], particles->
mass[domainListIndex]);
1965 integer index = threadIdx.x + blockIdx.x * blockDim.x;
1971 int domainListIndex;
1973 key2test = (
keyType)blockIdx.x << (
DIM * (maxLevel - 1));
1975 if (threadIdx.x == 0) {
1986 key2test += (
keyType)threadIdx.x << (
DIM * (maxLevel - 2));
1987 if (threadIdx.x == (
POW_DIM - 1) && blockIdx.x == (
POW_DIM - 1)) {
1991 keyMax = (shiftValue << toShift) - 1;
1995 keyMax = (shiftValue << toShift) - 1;
1999 keyMax = (shiftValue << toShift) - 1;
2004 keyMax = key2test + (1UL << (
DIM * (maxLevel - 2)));
2010 while (key2test < keyMax && level > 1) {
2011 if (subDomainKeyTree->
isDomainListNode(key2test & (~0UL << (
DIM * (maxLevel - level + 1))),
2012 maxLevel, level-1, curveType)) {
2022 if (subDomainKeyTree->
isDomainListNode(key2test, maxLevel, level, curveType)) {
2026 key2test = key2test + (1UL <<
DIM * (maxLevel - level));
2027 while (((key2test >> (
DIM * (maxLevel - level))) & (
keyType)(
POW_DIM - 1)) == 0UL) {
2043 integer index = threadIdx.x + blockIdx.x * blockDim.x;
2044 integer stride = blockDim.x * gridDim.x;
2047 bool lowestDomainListNode;
2054 lowestDomainListNode =
true;
2058 for (
int i=0; i<
POW_DIM; i++) {
2059 childIndex =
tree->child[
POW_DIM * domainIndex + i];
2061 if (childIndex != -1) {
2063 if (childIndex >= n) {
2065 if (particles->
nodeType[childIndex] >= 1) {
2066 lowestDomainListNode =
false;
2077 if (!lowestDomainListNode) {
2084 if (lowestDomainListNode) {
2089 particles->
nodeType[domainIndex] = 2;
2117 keyType *sortedDomainListKeys,
integer *relevantDomainListIndices,
2118 integer *relevantDomainListLevels,
integer *relevantDomainListProcess) {
2121 domainListIndex, domainListCounter, domainListKeys, sortedDomainListKeys,
2122 relevantDomainListIndices, relevantDomainListLevels, relevantDomainListProcess);
2128 relevantDomainListOriginalIndex);
2146 domainList, maxLevel, curveType);
2162 real d,
int index) {
2168 particles->
y[index] * particles->
y[index]) < d) {
2170 if (
cuda::math::sqrt(particles->
x[index] * particles->
x[index] + particles->
y[index] * particles->
y[index] +
2171 particles->
z[index] * particles->
z[index]) < d) {
2180 real d,
int index) {
2201 int *particles2remove,
int *counter,
int criterion,
real d,
int numParticles) {
2203 int bodyIndex = threadIdx.x + blockDim.x * blockIdx.x;
2204 int stride = blockDim.x * gridDim.x;
2210 switch (criterion) {
2218 cudaTerminate(
"Criterion for removing particles not available! Exiting...\n");
2222 particles2remove[bodyIndex + offset] = 1;
2223 atomicAdd(counter, 1);
2225 particles2remove[bodyIndex + offset] = 0;
2234 int *particles2remove,
int *counter,
int criterion,
real d,
2248 template<
typename T,
typename U>
2251 integer index = threadIdx.x + blockIdx.x * blockDim.x;
2252 integer stride = blockDim.x * gridDim.x;
2258 while ((index + offset) < length) {
2259 if (array[index + offset] != -1) {
2260 for (
integer i = 0; i < length; i++) {
2261 if (i != (index + offset)) {
2262 if (array[i] != -1 && (array[index + offset] == array[i] || (entry1[array[i]] == entry1[array[index + offset]] &&
2263 entry2[array[i]] == entry2[array[index + offset]] &&
2264 entry3[array[i]] == entry3[array[index + offset]]))) {
2268 printf(
"DUPLICATE: %i vs %i | (%f, %f, %f) vs (%f, %f, %f):\n",
2269 array[index + offset], array[i],
2270 entry1[array[index + offset]], entry2[array[index + offset]], entry3[array[index + offset]],
2271 entry1[array[i]], entry2[array[i]], entry3[array[i]]);
2276 printf(
"DUPLICATE is domainList!\n");
2280 for (
int k=0; k<
POW_DIM; k++) {
2281 if (child[
POW_DIM*array[index + offset] + k] == array[i]) {
2282 printf(
"isChild: index = %i: child %i == i = %i\n", array[index + offset],
2287 if (child[8*array[i] + k] == array[index + offset]) {
2288 printf(
"isChild: index = %i: child %i == index = %i\n", array[i],
2289 k, array[index + offset]);
2295 printf(
"isChild: Duplicate NOT a child: %i vs %i | (%f, %f, %f) vs (%f, %f, %f):\n",
2296 array[index + offset], array[i],
2297 entry1[array[index + offset]], entry2[array[index + offset]], entry3[array[index + offset]],
2298 entry1[array[i]], entry2[array[i]], entry3[array[i]]);
2324 template <
typename T,
unsigned int blockSize>
2327 extern __shared__ T sdata[];
2329 unsigned int tid = threadIdx.x;
2330 unsigned int i = blockIdx.x*(blockSize*2) + tid;
2331 unsigned int gridSize = blockSize*2*gridDim.x;
2337 sdata[tid] += array[i] + array[i+blockSize];
2343 if (blockSize >= 512) {
2345 array[tid] += array[tid + 256];
2349 if (blockSize >= 256) {
2351 array[tid] += array[tid + 128];
2355 if (blockSize >= 128) {
2357 array[tid] += array[tid + 64];
2364 if (blockSize >= 64) array[tid] += array[tid + 32];
2365 if (blockSize >= 32) array[tid] += array[tid + 16];
2366 if (blockSize >= 16) array[tid] += array[tid + 8];
2367 if (blockSize >= 8) array[tid] += array[tid + 4];
2368 if (blockSize >= 4) array[tid] += array[tid + 2];
2369 if (blockSize >= 2) array[tid] += array[tid + 1];
2373 outputData[blockIdx.x] = array[0];
2379 template <
typename T,
unsigned int blockSize>
2381 integer index = threadIdx.x + blockIdx.x * blockDim.x;
2382 integer stride = blockDim.x * gridDim.x;
2387 while ((index + offset) < blockSize) {
2388 atomicAdd(&outdata[0], indata[index + offset]);
2396 template<
typename T,
typename U>
2399 return cuda::launch(
true, executionPolicy, ::CudaUtils::Kernel::markDuplicatesTemp<T, U>,
tree, domainList, array, entry1, entry2,
2400 entry3, duplicateCounter, child, length);
2404 template <
typename T,
unsigned int blockSize>
2406 ExecutionPolicy executionPolicy(256, blockSize, blockSize *
sizeof(T));
2407 return cuda::launch(
true, executionPolicy, ::CudaUtils::Kernel::reduceBlockwise<T, blockSize>, array, outputData, n);
2411 template <
typename T,
unsigned int blockSize>
2414 return cuda::launch(
true, executionPolicy, ::CudaUtils::Kernel::blockReduction<T, blockSize>,
2427 template <
unsigned int blockSize>
2430 extern __shared__
real sdata[];
2433 real *ly = &sdata[blockSize];
2435 real *lz = &sdata[2 * blockSize];
2438 unsigned int tid = threadIdx.x;
2439 unsigned int i = blockIdx.x*(blockSize*2) + tid;
2440 unsigned int gridSize = blockSize*2*gridDim.x;
2454 lx[tid] += particles->
mass[i] * (particles->
y[i]*particles->
vz[i] - particles->
z[i]*particles->
vy[i]) +
2455 particles->
mass[i+blockSize] * (particles->
y[i+blockSize]*particles->
vz[i+blockSize] - particles->
z[i+blockSize]*particles->
vy[i+blockSize]);
2457 ly[tid] += particles->
mass[i] * (particles->
z[i]*particles->
vx[i] - particles->
x[i]*particles->
vz[i]) +
2458 particles->
mass[i+blockSize] * (particles->
z[i+blockSize]*particles->
vx[i+blockSize] - particles->
x[i+blockSize]*particles->
vz[i+blockSize]);
2460 lz[tid] += particles->
mass[i] * (particles->
x[i]*particles->
vy[i] - particles->
y[i]*particles->
vx[i]) +
2461 particles->
mass[i+blockSize] * (particles->
x[i+blockSize]*particles->
vy[i+blockSize] - particles->
y[i+blockSize]*particles->
vx[i+blockSize]);
2469 if (blockSize >= 512) {
2471 lx[tid] += lx[tid + 256];
2473 ly[tid] += ly[tid + 256];
2475 lz[tid] += lz[tid + 256];
2481 if (blockSize >= 256) {
2483 lx[tid] += lx[tid + 128];
2485 ly[tid] += ly[tid + 128];
2487 lz[tid] += lz[tid + 128];
2493 if (blockSize >= 128) {
2495 lx[tid] += lx[tid + 64];
2497 ly[tid] += ly[tid + 64];
2499 lz[tid] += lz[tid + 64];
2508 if (blockSize >= 64) {
2509 lx[tid] += lx[tid + 32];
2511 ly[tid] += ly[tid + 32];
2513 lz[tid] += lz[tid + 32];
2517 if (blockSize >= 32) {
2518 lx[tid] += lx[tid + 16];
2520 ly[tid] += ly[tid + 16];
2522 lz[tid] += lz[tid + 16];
2526 if (blockSize >= 16) {
2527 lx[tid] += lx[tid + 8];
2529 ly[tid] += ly[tid + 8];
2531 lz[tid] += lz[tid + 8];
2535 if (blockSize >= 8) {
2536 lx[tid] += lx[tid + 4];
2538 ly[tid] += ly[tid + 4];
2540 lz[tid] += lz[tid + 4];
2544 if (blockSize >= 4) {
2545 lx[tid] += lx[tid + 2];
2547 ly[tid] += ly[tid + 2];
2549 lz[tid] += lz[tid + 2];
2553 if (blockSize >= 2) {
2554 lx[tid] += lx[tid + 1];
2556 ly[tid] += ly[tid + 1];
2558 lz[tid] += lz[tid + 1];
2565 outputData[blockIdx.x] = lx[0];
2567 outputData[blockSize + blockIdx.x] = ly[0];
2569 outputData[2 * blockSize + blockIdx.x] = lz[0];
2575 template <
unsigned int blockSize>
2578 integer index = threadIdx.x + blockIdx.x * blockDim.x;
2579 integer stride = blockDim.x * gridDim.x;
2584 while ((index + offset) < blockSize) {
2585 atomicAdd(&outdata[0], indata[index + offset]);
2587 atomicAdd(&outdata[1], indata[blockSize + index + offset]);
2589 atomicAdd(&outdata[2], indata[2 * blockSize + index + offset]);
2599 integer index = threadIdx.x + blockIdx.x * blockDim.x;
2600 integer stride = blockDim.x * gridDim.x;
2604 while ((index + offset) < n) {
2606 vel =
abs(particles->
vx[index + offset]);
2609 particles->
vy[index + offset] * particles->
vy[index + offset]);
2612 particles->
vy[index + offset] * particles->
vy[index + offset] +
2613 particles->
vz[index + offset] * particles->
vz[index + offset]);
2616 particles->
u[index + offset] += 0.5 * particles->
mass[index + offset] * vel * vel;
2622 template <
unsigned int blockSize>
2625 return cuda::launch(
true, executionPolicy, ::Physics::Kernel::calculateAngularMomentumBlockwise<blockSize>,
2626 particles, outputData, n);
2630 template <
unsigned int blockSize>
2633 return cuda::launch(
true, executionPolicy, ::Physics::Kernel::sumAngularMomentum<blockSize>,
keyType * domainListKeys
domain list node keys
integer * domainListIndex
domain list node index, thus amount of domain list nodes
CUDA_CALLABLE_MEMBER void setBorders(real *borders, integer *relevantDomainListOriginalIndex)
keyType * sortedDomainListKeys
sorted domain list node keys, usable as output for sorting the keys
CUDA_CALLABLE_MEMBER DomainList()
Constructor.
integer * relevantDomainListOriginalIndex
integer * domainListIndices
domain list node indices
integer * domainListLevels
domain list node levels
integer * relevantDomainListIndices
concentrate domain list nodes, usable to reduce domain list indices in respect to some criterion
integer * domainListCounter
domain list node counter, usable as buffer
integer * relevantDomainListProcess
CUDA_CALLABLE_MEMBER void set(integer *domainListIndices, integer *domainListLevels, integer *domainListIndex, integer *domainListCounter, keyType *domainListKeys, keyType *sortedDomainListKeys, integer *relevantDomainListIndices, integer *relevantDomainListLevels, integer *relevantDomainListProcess)
Setter, passing pointer to member variables.
CUDA_CALLABLE_MEMBER ~DomainList()
Destructor.
integer * relevantDomainListLevels
Execution policy/instruction for CUDA kernel execution.
Particle(s) class based on SoA (Structur of Arrays).
real * x
(pointer to) x position (array)
integer * level
(pointer to) level of the (pseudo-)particles
real * g_ay
(pointer to) y gravitational acceleration (array)
real * ay
(pointer to) y acceleration (array)
integer * nodeType
(pointer to) node type
real * az
(pointer to) z acceleration (array)
real * y
(pointer to) y position (array)
real * ax
(pointer to) x acceleration (array)
real * u
energy (kinetic + gravitational for now)
real * g_ax
(pointer to) x gravitational acceleration (array)
real * mass
(pointer to) mass (array)
real * z
(pointer to) z position (array)
real * vz
(pointer to) z velocity (array)
real * g_az
(pointer to) z gravitational acceleration (array)
real * vx
(pointer to) x velocity (array)
real * vy
(pointer to) y velocity (array)
SubDomainKeyTree class handling rank, number of processes and ranges.
keyType * range
Space-filling curve ranges, mapping key ranges/borders to MPI processes.
CUDA_CALLABLE_MEMBER ~SubDomainKeyTree()
Destructor.
CUDA_CALLABLE_MEMBER integer key2proc(keyType key)
Compute particle's MPI process belonging by it's key.
CUDA_CALLABLE_MEMBER bool isDomainListNode(keyType key, integer maxLevel, integer level, Curve::Type curveType=Curve::lebesgue)
Check whether key, thus particle, represents a domain list node.
CUDA_CALLABLE_MEMBER void set(integer rank, integer numProcesses, keyType *range, integer *procParticleCounter)
Setter.
integer * procParticleCounter
particle counter in dependence of MPI process(es)
CUDA_CALLABLE_MEMBER SubDomainKeyTree()
Default Constructor.
integer numProcesses
MPI number of processes.
#define cudaTerminate(...)
#define CUDA_CALLABLE_MEMBER
template real reduceBlockwise< real, 256 >(real *array, real *outputData, int n)
template real markDuplicatesTemp< integer, real >(Tree *tree, DomainList *domainList, integer *array, real *entry1, real *entry2, real *entry3, integer *duplicateCounter, integer *child, int length)
template real blockReduction< real, 256 >(const real *indata, real *outdata)
__global__ void markDuplicatesTemp(Tree *tree, DomainList *domainList, T *array, U *entry1, U *entry2, U *entry3, integer *duplicateCounter, integer *child, int length)
__global__ void reduceBlockwise(T *array, T *outputData, int n)
__global__ void blockReduction(const T *indata, T *outdata)
__global__ void lowestDomainList(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, DomainList *lowestDomainList, integer n, integer m)
Kernel to create the lowest domain list.
__global__ void setBorders(DomainList *domainList, real *borders, integer *relevantDomainListOriginalIndex)
__global__ void createDomainList(SubDomainKeyTree *subDomainKeyTree, DomainList *domainList, integer maxLevel, Curve::Type curveType=Curve::lebesgue)
Kernel to create the domain list.
real sortArray(A *arrayToSort, A *sortedArray, B *keyIn, B *keyOut, integer n)
CUDA_CALLABLE_MEMBER integer key2proc(keyType key, SubDomainKeyTree *subDomainKeyTree)
Convert the key to the corresponding process.
CUDA_CALLABLE_MEMBER keyType lebesgue2hilbert(keyType lebesgue, integer maxLevel)
Convert a Lebesgue key to a Hilbert key.
CUDA_CALLABLE_MEMBER void key2Char(keyType key, integer maxLevel, char *keyAsChar)
Convert a key to a char for printing.
__global__ void info(Material *material)
Debug kernel giving information about material(s).
__global__ void test(Particles *particles)
__global__ void mark2remove(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, int *particles2remove, int *counter, int criterion, real d, int numParticles)
Particle class related functions and kernels.
__device__ bool applySphericalCriterion(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, real d, int index)
Check whether particle(s) are within sphere from simulation center.
__device__ bool applyCubicCriterion(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, real d, int index)
Check whether particle(s) are within cube from simulation center.
template real sumAngularMomentum< 256 >(const real *indata, real *outdata)
template real calculateAngularMomentumBlockwise< 256 >(Particles *particles, real *outputData, int n)
__global__ void calculateAngularMomentumBlockwise(Particles *particles, real *outputData, int n)
Calculate angular momentum for all particles (per block).
__global__ void kineticEnergy(Particles *particles, int n)
Calculate kinetic energy.
__global__ void sumAngularMomentum(const real *indata, real *outdata)
Calculate angular momentum: sum over blocks.
Physics related functions and kernels.
const char *const repairTree
const char *const numParticles
real compLocalPseudoParticles(Tree *tree, Particles *particles, DomainList *domainList, int n)
Wrapper for SubDomainKeyTreeNS::Kernel::compLocalPseudoParticles().
real particlesPerProcess(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, integer n, integer m, Curve::Type curveType=Curve::lebesgue)
Wrapper for SubDomainKeyTreeNS::Kernel::particlesPerProcess().
real updateLowestDomainListNodes(Particles *particles, DomainList *lowestDomainList, T *buffer, Entry::Name entry)
Wrapper for SubDomainKeyTreeNS::Kernel::updateLowestDomainListNodes().
real compDomainListPseudoParticlesPerLevel(Tree *tree, Particles *particles, DomainList *domainList, DomainList *lowestDomainList, int n, int level)
Wrapper for SubDomainKeyTreeNS::Kernel::compDomainListPseudoParticlesPerLevel().
real compDomainListPseudoParticles(Tree *tree, Particles *particles, DomainList *domainList, DomainList *lowestDomainList, int n)
Wrapper for SubDomainKeyTreeNS::Kernel::compDomainListPseudoParticles().
real prepareLowestDomainExchange(Particles *particles, DomainList *lowestDomainList, T *buffer, Entry::Name entry)
Wrapper for SubDomainKeyTreeNS::Kernel::prepareLowestDomainExchange().
real zeroDomainListNodes(Particles *particles, DomainList *domainList, DomainList *lowestDomainList)
Wrapper for SubDomainKeyTreeNS::Kernel::zeroDomainListNodes().
real buildDomainTree(Tree *tree, Particles *particles, DomainList *domainList, integer n, integer m)
Wrapper for SubDomainKeyTreeNS::Kernel::buildDomainTree().
real createKeyHistRanges(Helper *helper, integer bins)
real repairTree(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, DomainList *lowestDomainList, int n, int m, Curve::Type curveType)
Wrapper for SubDomainKeyTreeNS::Kernel::repairTree().
real getParticleKeys(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, keyType *keys, integer maxLevel, integer n, Curve::Type curveType=Curve::lebesgue)
Wrapper for SubDomainKeyTreeNS::Kernel::getParticleKeys().
real keyHistCounter(Tree *tree, Particles *particles, SubDomainKeyTree *subDomainKeyTree, Helper *helper, int bins, int n, Curve::Type curveType=Curve::lebesgue)
real markParticlesProcess(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, integer n, integer m, integer *sortArray, Curve::Type curveType=Curve::lebesgue)
Wrapper for ::SubDomainKeyTreeNS::Kernel::markParticlesPerProcess().
real compLowestDomainListNodes(Tree *tree, Particles *particles, DomainList *lowestDomainList)
Wrapper for SubDomainKeyTreeNS::Kernel::compLowestDomainListNodes().
void set(SubDomainKeyTree *subDomainKeyTree, integer rank, integer numProcesses, keyType *range, integer *procParticleCounter)
Wrapper for SubDomainKeyTreeNS::Kernel::set().
void test(SubDomainKeyTree *subDomainKeyTree)
Wrapper for SubDomainKeyTreeNS::Kernel::test().
real calculateNewRange(SubDomainKeyTree *subDomainKeyTree, Helper *helper, int bins, int n, Curve::Type curveType=Curve::lebesgue)
__global__ void markParticlesProcess(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, integer n, integer m, integer *sortArray, Curve::Type curveType=Curve::lebesgue)
Kernel to mark particle's belonging.
__global__ void test(SubDomainKeyTree *subDomainKeyTree)
Test kernel call (for debugging/testing purposes).
__global__ void compDomainListPseudoParticles(Tree *tree, Particles *particles, DomainList *domainList, DomainList *lowestDomainList, int n)
__global__ void getParticleKeys(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, keyType *keys, integer maxLevel, integer n, Curve::Type curveType=Curve::lebesgue)
Kernel to get all particle keys (and additional information for debugging purposes).
__global__ void repairTree(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, DomainList *domainList, DomainList *lowestDomainList, int n, int m, Curve::Type curveType)
Repair tree by removing received and inserted (pseudo-)particles.
__global__ void set(SubDomainKeyTree *subDomainKeyTree, integer rank, integer numProcesses, keyType *range, integer *procParticleCounter)
Kernel call to setter.
__global__ void compLowestDomainListNodes(Tree *tree, Particles *particles, DomainList *lowestDomainList)
Compute/Find lowest domain list nodes.
__global__ void prepareLowestDomainExchange(Particles *particles, DomainList *lowestDomainList, T *buffer, Entry::Name entry)
Prepare lowest domain exchange via MPI by copying to contiguous memory.
__global__ void updateLowestDomainListNodes(Particles *particles, DomainList *lowestDomainList, T *buffer, Entry::Name entry)
Update lowest domain list nodes.
__global__ void particlesPerProcess(SubDomainKeyTree *subDomainKeyTree, Tree *tree, Particles *particles, integer n, integer m, Curve::Type curveType=Curve::lebesgue)
Kernel to check particle's belonging and count in dependence of belonging/process.
__global__ void keyHistCounter(Tree *tree, Particles *particles, SubDomainKeyTree *subDomainKeyTree, Helper *helper, int bins, int n, Curve::Type curveType=Curve::lebesgue)
__global__ void compLocalPseudoParticles(Tree *tree, Particles *particles, DomainList *domainList, int n)
Compute local (tree) pseudo particles.
__global__ void calculateNewRange(SubDomainKeyTree *subDomainKeyTree, Helper *helper, int bins, int n, Curve::Type curveType=Curve::lebesgue)
__global__ void compDomainListPseudoParticlesPerLevel(Tree *tree, Particles *particles, DomainList *domainList, DomainList *lowestDomainList, int n, int level)
Compute domain list pseudo particles (per level).
__global__ void buildDomainTree(Tree *tree, Particles *particles, DomainList *domainList, integer n, integer m)
Kernel to build the domain tree.
__global__ void zeroDomainListNodes(Particles *particles, DomainList *domainList, DomainList *lowestDomainList)
Zero domain list nodes.
__global__ void createKeyHistRanges(Helper *helper, integer bins)
SubDomainKeyTree related functions and kernels.
__device__ real sqrt(real a)
Square root of a floating point value.
__device__ real abs(real a)
Absolute value of a floating point value.
void set(T *d_var, T val, std::size_t count=1)
Set device memory to a specific value.
real launch(bool timeKernel, const ExecutionPolicy &policy, void(*f)(Arguments...), Arguments... args)
CUDA execution wrapper function.
#define DIM
Dimension of the problem.